From 747ada36ee23225d81657e4d633ac93b8ccbea7d Mon Sep 17 00:00:00 2001
From: Olaf Dabrunz <od@suse.de>
Date: Wed, 11 Jun 2008 16:35:13 +0200
Subject: pci: add PCI IDs for devices that need boot irq quirks

Signed-off-by: Stefan Assmann <sassmann@suse.de>
Signed-off-by: Olaf Dabrunz <od@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/pci_ids.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 65953822c9c..7f3f101e03c 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2235,6 +2235,10 @@
 #define PCI_DEVICE_ID_INTEL_PXH_0	0x0329
 #define PCI_DEVICE_ID_INTEL_PXH_1	0x032A
 #define PCI_DEVICE_ID_INTEL_PXHV	0x032C
+#define PCI_DEVICE_ID_INTEL_80332_0	0x0330
+#define PCI_DEVICE_ID_INTEL_80332_1	0x0332
+#define PCI_DEVICE_ID_INTEL_80333_0	0x0370
+#define PCI_DEVICE_ID_INTEL_80333_1	0x0372
 #define PCI_DEVICE_ID_INTEL_82375	0x0482
 #define PCI_DEVICE_ID_INTEL_82424	0x0483
 #define PCI_DEVICE_ID_INTEL_82378	0x0484
@@ -2307,6 +2311,7 @@
 #define PCI_DEVICE_ID_INTEL_ESB_4	0x25a4
 #define PCI_DEVICE_ID_INTEL_ESB_5	0x25a6
 #define PCI_DEVICE_ID_INTEL_ESB_9	0x25ab
+#define PCI_DEVICE_ID_INTEL_ESB_10	0x25ac
 #define PCI_DEVICE_ID_INTEL_82820_HB	0x2500
 #define PCI_DEVICE_ID_INTEL_82820_UP_HB	0x2501
 #define PCI_DEVICE_ID_INTEL_82850_HB	0x2530
-- 
cgit v1.2.3-70-g09d2


From e1d3a90846b40ad3160bf4b648d36c6badad39ac Mon Sep 17 00:00:00 2001
From: Stefan Assmann <sassmann@suse.de>
Date: Wed, 11 Jun 2008 16:35:17 +0200
Subject: pci, acpi: reroute PCI interrupt to legacy boot interrupt equivalent

Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the
IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel
does during interrupt handling). On chipsets where this INTx generation
cannot be disabled, we reroute the valid interrupts to their legacy
equivalent to get rid of spurious interrupts that might otherwise bring
down (vital) interrupt lines through spurious interrupt detection in
note_interrupt().

This patch benefited from discussions with Alexander Graf, Torsten Duwe,
Ihno Krumreich, Daniel Gollub, Hannes Reinecke. The conclusions we drew
and the patch itself are the authors' responsibility alone.

Signed-off-by: Stefan Assmann <sassmann@suse.de>
Signed-off-by: Olaf Dabrunz <od@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/acpi/pci_irq.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/quirks.c   | 28 +++++++++++++++++++++++++
 include/linux/pci.h    |  6 ++++++
 3 files changed, 90 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 89022a74fae..b37cb0a9826 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -384,6 +384,27 @@ acpi_pci_free_irq(struct acpi_prt_entry *entry,
 	return irq;
 }
 
+#ifdef CONFIG_X86_IO_APIC
+extern int noioapicquirk;
+
+static int bridge_has_boot_interrupt_variant(struct pci_bus *bus)
+{
+	struct pci_bus *bus_it;
+
+	for (bus_it = bus ; bus_it ; bus_it = bus_it->parent) {
+		if (!bus_it->self)
+			return 0;
+
+		printk(KERN_INFO "vendor=%04x device=%04x\n", bus_it->self->vendor,
+				bus_it->self->device);
+
+		if (bus_it->self->irq_reroute_variant)
+			return bus_it->self->irq_reroute_variant;
+	}
+	return 0;
+}
+#endif /* CONFIG_X86_IO_APIC */
+
 /*
  * acpi_pci_irq_lookup
  * success: return IRQ >= 0
@@ -413,6 +434,41 @@ acpi_pci_irq_lookup(struct pci_bus *bus,
 	}
 
 	ret = func(entry, triggering, polarity, link);
+
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the
+	 * IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel
+	 * does during interrupt handling). When this INTx generation cannot be
+	 * disabled, we reroute these interrupts to their legacy equivalent to
+	 * get rid of spurious interrupts.
+	 */
+        if (!noioapicquirk) {
+		switch (bridge_has_boot_interrupt_variant(bus)) {
+		case 0:
+			/* no rerouting necessary */
+			break;
+
+		case INTEL_IRQ_REROUTE_VARIANT:
+			/*
+			 * Remap according to INTx routing table in 6700PXH
+			 * specs, intel order number 302628-002, section
+			 * 2.15.2. Other chipsets (80332, ...) have the same
+			 * mapping and are handled here as well.
+			 */
+			printk(KERN_INFO "pci irq %d -> rerouted to legacy "
+					 "irq %d\n", ret, (ret % 4) + 16);
+			ret = (ret % 4) + 16;
+			break;
+
+		default:
+			printk(KERN_INFO "not rerouting irq %d to legacy irq: "
+					 "unknown mapping\n", ret);
+			break;
+		}
+	}
+#endif /* CONFIG_X86_IO_APIC */
+
 	return ret;
 }
 
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index eb97564316d..ac634ae2eb0 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1364,6 +1364,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	0x260a, quirk_intel_pcie_pm);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	0x260b, quirk_intel_pcie_pm);
 
 #ifdef CONFIG_X86_IO_APIC
+/*
+ * Boot interrupts on some chipsets cannot be turned off. For these chipsets,
+ * remap the original interrupt in the linux kernel to the boot interrupt, so
+ * that a PCI device's interrupt handler is installed on the boot interrupt
+ * line instead.
+ */
+static void quirk_reroute_to_boot_interrupts_intel(struct pci_dev *dev)
+{
+	int i;
+
+	if (noioapicquirk)
+		return;
+
+	dev->irq_reroute_variant = INTEL_IRQ_REROUTE_VARIANT;
+
+	printk(KERN_INFO "PCI quirk: reroute interrupts for 0x%04x:0x%04x\n",
+			dev->vendor, dev->device);
+	return;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_80333_0,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_80333_1,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_ESB2_0,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXH_0,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXH_1,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXHV,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_80332_0,	quirk_reroute_to_boot_interrupts_intel);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_80332_1,	quirk_reroute_to_boot_interrupts_intel);
+
 /*
  * On some chipsets we can disable the generation of legacy INTx boot
  * interrupts.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d18b1dd49fa..6755cf5ac10 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -117,6 +117,11 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG = (__force pci_dev_flags_t) 1,
 };
 
+enum pci_irq_reroute_variant {
+	INTEL_IRQ_REROUTE_VARIANT = 1,
+	MAX_IRQ_REROUTE_VARIANTS = 3
+};
+
 typedef unsigned short __bitwise pci_bus_flags_t;
 enum pci_bus_flags {
 	PCI_BUS_FLAGS_NO_MSI   = (__force pci_bus_flags_t) 1,
@@ -194,6 +199,7 @@ struct pci_dev {
 	unsigned int	no_d1d2:1;   /* only allow d0 or d3 */
 	unsigned int	block_ucfg_access:1;	/* userspace config space access is blocked */
 	unsigned int	broken_parity_status:1;	/* Device generates false positive parity */
+	unsigned int	irq_reroute_variant:2;	/* device needs IRQ rerouting variant */
 	unsigned int 	msi_enabled:1;
 	unsigned int	msix_enabled:1;
 	unsigned int	is_managed:1;
-- 
cgit v1.2.3-70-g09d2


From bd8fbdee6562ee526f3c2582a3b373ef195015dd Mon Sep 17 00:00:00 2001
From: Rui Sousa <rui.p.m.sousa@gmail.com>
Date: Wed, 3 Sep 2008 17:53:07 +0200
Subject: lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not
 set

This patch fixes compilation if CONFIG_TRACE_IRQFLAGS_SUPPORT is ever
disabled (which is currently not allowed by Kconfig). Alternatively
we could just remove the option altogether and the associated code
paths. Since the compilation error has been in the tree for at
least two years and no one noticed it, I guess we don't really have
the need for CONFIG_TRACE_IRQFLAGS_SUPPORT=n.

Boot tested on x86 UP.

Signed-off-by: Rui Sousa <rui.p.m.sousa@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irqflags.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 74bde13224c..f2993512b3b 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -52,10 +52,10 @@
 # define start_critical_timings() do { } while (0)
 #endif
 
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-
 #include <asm/irqflags.h>
 
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+
 #define local_irq_enable() \
 	do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
 #define local_irq_disable() \
@@ -84,21 +84,20 @@
  * The local_irq_*() APIs are equal to the raw_local_irq*()
  * if !TRACE_IRQFLAGS.
  */
-# define raw_local_irq_disable()	local_irq_disable()
-# define raw_local_irq_enable()		local_irq_enable()
-# define raw_local_irq_save(flags)			\
+#define local_irq_disable()		raw_local_irq_disable()
+#define local_irq_enable()		raw_local_irq_enable()
+#define local_irq_save(flags)				\
 	do {						\
 		typecheck(unsigned long, flags);	\
-		local_irq_save(flags);			\
+		raw_local_irq_save(flags);		\
 	} while (0)
-# define raw_local_irq_restore(flags)			\
+# define local_irq_restore(flags)			\
 	do {						\
 		typecheck(unsigned long, flags);	\
-		local_irq_restore(flags);		\
+		raw_local_irq_restore(flags);		\
 	} while (0)
 #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */
 
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 #define safe_halt()						\
 	do {							\
 		trace_hardirqs_on();				\
@@ -124,6 +123,5 @@
 	typecheck(unsigned long, flags);	\
 	raw_irqs_disabled_flags(flags);		\
 })
-#endif		/* CONFIG_X86 */
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Aug 2008 15:29:38 -0700
Subject: debug: add notifier chain debugging, v2

- unbreak ia64 (and powerpc) where function pointers dont
  point at code but at data (reported by Tony Luck)

[ mingo@elte.hu: various cleanups ]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel.h |  3 +++
 kernel/extable.c       | 16 ++++++++++++++++
 kernel/notifier.c      | 10 +---------
 lib/vsprintf.c         |  2 +-
 4 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2651f805ba6..4e1366b552a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -187,6 +187,9 @@ extern unsigned long long memparse(char *ptr, char **retptr);
 extern int core_kernel_text(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
+extern int func_ptr_is_kernel_text(void *ptr);
+extern void *dereference_function_descriptor(void *ptr);
+
 struct pid;
 extern struct pid *session_of_pgrp(struct pid *pgrp);
 
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e1702..adf0cc9c02d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr)
 		return 1;
 	return module_text_address(addr) != NULL;
 }
+
+/*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */
+int func_ptr_is_kernel_text(void *ptr)
+{
+	unsigned long addr;
+	addr = (unsigned long) dereference_function_descriptor(ptr);
+	if (core_kernel_text(addr))
+		return 1;
+	return module_text_address(addr) != NULL;
+}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 143fdd77dbf..0f39e398ef6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -21,10 +21,6 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
 static int notifier_chain_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
-	if (!kernel_text_address((unsigned long)n->notifier_call)) {
-		WARN(1, "Invalid notifier registered!");
-		return 0;
-	}
 	while ((*nl) != NULL) {
 		if (n->priority > (*nl)->priority)
 			break;
@@ -38,10 +34,6 @@ static int notifier_chain_register(struct notifier_block **nl,
 static int notifier_chain_cond_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
-	if (!kernel_text_address((unsigned long)n->notifier_call)) {
-		WARN(1, "Invalid notifier registered!");
-		return 0;
-	}
 	while ((*nl) != NULL) {
 		if ((*nl) == n)
 			return 0;
@@ -92,7 +84,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 		next_nb = rcu_dereference(nb->next);
 
 #ifdef CONFIG_DEBUG_NOTIFIERS
-		if (!kernel_text_address((unsigned long)nb->notifier_call)) {
+		if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
 			WARN(1, "Invalid notifier called!");
 			nb = next_nb;
 			continue;
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index d8d1d114224..f5e5ffb9942 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -513,7 +513,7 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
 	return buf;
 }
 
-static inline void *dereference_function_descriptor(void *ptr)
+void *dereference_function_descriptor(void *ptr)
 {
 #if defined(CONFIG_IA64) || defined(CONFIG_PPC64)
 	void *p;
-- 
cgit v1.2.3-70-g09d2


From 76b189e91845eab3a9d52bb97f971d312d25652d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Sep 2008 09:57:35 +0200
Subject: lockdep: add might_lock() / might_lock_read()

useful to establish a lock dependency in case the actual dependency is
rare or hard to trigger.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 331e5f1c2d8..0aa657aa8a1 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -480,4 +480,22 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 # define lock_map_release(l)			do { } while (0)
 #endif
 
+#ifdef CONFIG_PROVE_LOCKING
+# define might_lock(lock) 						\
+do {									\
+	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
+	lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_);	\
+	lock_release(&(lock)->dep_map, 0, _THIS_IP_);			\
+} while (0)
+# define might_lock_read(lock) 						\
+do {									\
+	typecheck(struct lockdep_map *, &(lock)->dep_map);		\
+	lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_);	\
+	lock_release(&(lock)->dep_map, 0, _THIS_IP_);			\
+} while (0)
+#else
+# define might_lock(lock) do { } while (0)
+# define might_lock_read(lock) do { } while (0)
+#endif
+
 #endif /* __LINUX_LOCKDEP_H */
-- 
cgit v1.2.3-70-g09d2


From 3ee1afa308f2a38e5d1e2ad3752ad7abcf480da1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 10 Sep 2008 13:37:17 +0200
Subject: x86: some lock annotations for user copy paths, v2

 - introduce might_fault()
 - handle the atomic user copy paths correctly

[ mingo@elte.hu: move might_sleep() outside of in_atomic(). ]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/usercopy_32.c   | 12 +++---------
 arch/x86/lib/usercopy_64.c   |  8 ++------
 include/asm-x86/uaccess.h    | 18 ++++--------------
 include/asm-x86/uaccess_32.h | 12 +++---------
 include/asm-x86/uaccess_64.h | 12 +++---------
 include/linux/kernel.h       |  9 +++++++++
 mm/memory.c                  | 15 +++++++++++++++
 7 files changed, 39 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 8eedde2a9ca..7393152a252 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -32,9 +32,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
 #define __do_strncpy_from_user(dst, src, count, res)			   \
 do {									   \
 	int __d0, __d1, __d2;						   \
-	might_sleep();							   \
-	if (current->mm)						   \
-		might_lock_read(&current->mm->mmap_sem);		   \
+	might_fault();							   \
 	__asm__ __volatile__(						   \
 		"	testl %1,%1\n"					   \
 		"	jz 2f\n"					   \
@@ -121,9 +119,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 #define __do_clear_user(addr,size)					\
 do {									\
 	int __d0;							\
-	might_sleep();							\
-	if (current->mm)						\
-		might_lock_read(&current->mm->mmap_sem);		\
+	might_fault();							\
 	__asm__ __volatile__(						\
 		"0:	rep; stosl\n"					\
 		"	movl %2,%0\n"					\
@@ -193,9 +189,7 @@ long strnlen_user(const char __user *s, long n)
 	unsigned long mask = -__addr_ok(s);
 	unsigned long res, tmp;
 
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 
 	__asm__ __volatile__(
 		"	testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 847d1294599..64d6c84e635 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,9 +15,7 @@
 #define __do_strncpy_from_user(dst,src,count,res)			   \
 do {									   \
 	long __d0, __d1, __d2;						   \
-	might_sleep();							   \
-	if (current->mm)						   \
-		might_lock_read(&current->mm->mmap_sem);		   \
+	might_fault();							   \
 	__asm__ __volatile__(						   \
 		"	testq %1,%1\n"					   \
 		"	jz 2f\n"					   \
@@ -66,9 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
 	long __d0;
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	/* no memory constraint because it doesn't change any memory gcc knows
 	   about */
 	asm volatile(
diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h
index ad29752a171..39f8420c75d 100644
--- a/include/asm-x86/uaccess.h
+++ b/include/asm-x86/uaccess.h
@@ -8,8 +8,6 @@
 #include <linux/thread_info.h>
 #include <linux/prefetch.h>
 #include <linux/string.h>
-#include <linux/lockdep.h>
-#include <linux/sched.h>
 #include <asm/asm.h>
 #include <asm/page.h>
 
@@ -159,9 +157,7 @@ extern int __get_user_bad(void);
 	int __ret_gu;							\
 	unsigned long __val_gu;						\
 	__chk_user_ptr(ptr);						\
-	might_sleep();							\
-	if (current->mm)						\
-		might_lock_read(&current->mm->mmap_sem);		\
+	might_fault();							\
 	switch (sizeof(*(ptr))) {					\
 	case 1:								\
 		__get_user_x(1, __ret_gu, __val_gu, ptr);		\
@@ -246,9 +242,7 @@ extern void __put_user_8(void);
 	int __ret_pu;						\
 	__typeof__(*(ptr)) __pu_val;				\
 	__chk_user_ptr(ptr);					\
-	might_sleep();						\
-	if (current->mm)					\
-		might_lock_read(&current->mm->mmap_sem);	\
+	might_fault();						\
 	__pu_val = x;						\
 	switch (sizeof(*(ptr))) {				\
 	case 1:							\
@@ -273,9 +267,7 @@ extern void __put_user_8(void);
 #define __put_user_size(x, ptr, size, retval, errret)			\
 do {									\
 	retval = 0;							\
-	might_sleep();							\
-	if (current->mm)						\
-		might_lock_read(&current->mm->mmap_sem);		\
+	might_fault();							\
 	__chk_user_ptr(ptr);						\
 	switch (size) {							\
 	case 1:								\
@@ -328,9 +320,7 @@ do {									\
 #define __get_user_size(x, ptr, size, retval, errret)			\
 do {									\
 	retval = 0;							\
-	might_sleep();							\
-	if (current->mm)						\
-		might_lock_read(&current->mm->mmap_sem);		\
+	might_fault();							\
 	__chk_user_ptr(ptr);						\
 	switch (size) {							\
 	case 1:								\
diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h
index d725e2d703f..d10e842ec3e 100644
--- a/include/asm-x86/uaccess_32.h
+++ b/include/asm-x86/uaccess_32.h
@@ -82,9 +82,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 static __always_inline unsigned long __must_check
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	return __copy_to_user_inatomic(to, from, n);
 }
 
@@ -139,9 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	if (__builtin_constant_p(n)) {
 		unsigned long ret;
 
@@ -163,9 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long __copy_from_user_nocache(void *to,
 				const void __user *from, unsigned long n)
 {
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	if (__builtin_constant_p(n)) {
 		unsigned long ret;
 
diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h
index 40a7205fe57..13fd56fbc3a 100644
--- a/include/asm-x86/uaccess_64.h
+++ b/include/asm-x86/uaccess_64.h
@@ -29,9 +29,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic(dst, (__force void *)src, size);
 	switch (size) {
@@ -75,9 +73,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
 	int ret = 0;
 
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst, src, size);
 	switch (size) {
@@ -121,9 +117,7 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
-	might_sleep();
-	if (current->mm)
-		might_lock_read(&current->mm->mmap_sem);
+	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst,
 					 (__force void *)src, size);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2651f805ba6..e580ec09576 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -140,6 +140,15 @@ extern int _cond_resched(void);
 		(__x < 0) ? -__x : __x;		\
 	})
 
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void);
+#else
+static inline void might_fault(void)
+{
+	might_sleep();
+}
+#endif
+
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
diff --git a/mm/memory.c b/mm/memory.c
index 1002f473f49..b8fdf4e5e65 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3016,3 +3016,18 @@ void print_vma_addr(char *prefix, unsigned long ip)
 	}
 	up_read(&current->mm->mmap_sem);
 }
+
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void)
+{
+	might_sleep();
+	/*
+	 * it would be nicer only to annotate paths which are not under
+	 * pagefault_disable, however that requires a larger audit and
+	 * providing helpers like get_user_atomic.
+	 */
+	if (!in_atomic() && current->mm)
+		might_lock_read(&current->mm->mmap_sem);
+}
+EXPORT_SYMBOL(might_fault);
+#endif
-- 
cgit v1.2.3-70-g09d2


From 1d18ef489509314506328b9e464dd47c24c1d68f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Sep 2008 20:53:21 +0200
Subject: x86: some lock annotations for user copy paths, v3

- add annotation back to clear_user()
- change probe_kernel_address() to _inatomic*() method

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/lib/usercopy_32.c | 1 +
 include/asm-x86/uaccess.h  | 2 --
 include/linux/uaccess.h    | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7393152a252..fab5faba1d3 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -148,6 +148,7 @@ do {									\
 unsigned long
 clear_user(void __user *to, unsigned long n)
 {
+	might_fault();
 	if (access_ok(VERIFY_WRITE, to, n))
 		__do_clear_user(to, n);
 	return n;
diff --git a/include/asm-x86/uaccess.h b/include/asm-x86/uaccess.h
index 39f8420c75d..dc8edb5c465 100644
--- a/include/asm-x86/uaccess.h
+++ b/include/asm-x86/uaccess.h
@@ -267,7 +267,6 @@ extern void __put_user_8(void);
 #define __put_user_size(x, ptr, size, retval, errret)			\
 do {									\
 	retval = 0;							\
-	might_fault();							\
 	__chk_user_ptr(ptr);						\
 	switch (size) {							\
 	case 1:								\
@@ -320,7 +319,6 @@ do {									\
 #define __get_user_size(x, ptr, size, retval, errret)			\
 do {									\
 	retval = 0;							\
-	might_fault();							\
 	__chk_user_ptr(ptr);						\
 	switch (size) {							\
 	case 1:								\
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index fec6decfb98..2062293e57e 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 							\
 		set_fs(KERNEL_DS);			\
 		pagefault_disable();			\
-		ret = __get_user(retval, (__force typeof(retval) __user *)(addr));		\
+		ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval));		\
 		pagefault_enable();			\
 		set_fs(old_fs);				\
 		ret;					\
-- 
cgit v1.2.3-70-g09d2


From 53b9d87f41a3d8838210ad7cdef02d814817ce85 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 11 Sep 2008 17:02:58 -0700
Subject: lock debug: sit tight when we are already in a panic

in:

  > http://bugzilla.kernel.org/show_bug.cgi?id=11543

The panic code called the kexec code which called mutex_trylock() which
called spin_lock_mutex() which then stupidly went and blurted a load of
debug stuff because of in_interrupt().

Keep the lock debug code from escallating an already crappy situation.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/debug_locks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 4aaa4afb1cb..096476f1fb3 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -17,7 +17,7 @@ extern int debug_locks_off(void);
 ({									\
 	int __ret = 0;							\
 									\
-	if (unlikely(c)) {						\
+	if (!oops_in_progress && unlikely(c)) {				\
 		if (debug_locks_off() && !debug_locks_silent)		\
 			WARN_ON(1);					\
 		__ret = 1;						\
-- 
cgit v1.2.3-70-g09d2


From 30742d5c2277c325fb0e9d2d817d55a19995fe8f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Sep 2008 14:43:39 +0200
Subject: Revert "lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT
 is not set"

This reverts commit bd8fbdee6562ee526f3c2582a3b373ef195015dd.

This broke the powerpc build - more fixes are needed before we can
undo this revert.
---
 include/linux/irqflags.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index f2993512b3b..74bde13224c 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -52,10 +52,10 @@
 # define start_critical_timings() do { } while (0)
 #endif
 
-#include <asm/irqflags.h>
-
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 
+#include <asm/irqflags.h>
+
 #define local_irq_enable() \
 	do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
 #define local_irq_disable() \
@@ -84,20 +84,21 @@
  * The local_irq_*() APIs are equal to the raw_local_irq*()
  * if !TRACE_IRQFLAGS.
  */
-#define local_irq_disable()		raw_local_irq_disable()
-#define local_irq_enable()		raw_local_irq_enable()
-#define local_irq_save(flags)				\
+# define raw_local_irq_disable()	local_irq_disable()
+# define raw_local_irq_enable()		local_irq_enable()
+# define raw_local_irq_save(flags)			\
 	do {						\
 		typecheck(unsigned long, flags);	\
-		raw_local_irq_save(flags);		\
+		local_irq_save(flags);			\
 	} while (0)
-# define local_irq_restore(flags)			\
+# define raw_local_irq_restore(flags)			\
 	do {						\
 		typecheck(unsigned long, flags);	\
-		raw_local_irq_restore(flags);		\
+		local_irq_restore(flags);		\
 	} while (0)
 #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */
 
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 #define safe_halt()						\
 	do {							\
 		trace_hardirqs_on();				\
@@ -123,5 +124,6 @@
 	typecheck(unsigned long, flags);	\
 	raw_irqs_disabled_flags(flags);		\
 })
+#endif		/* CONFIG_X86 */
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From fb71e45338453698bd7460f7e8f171ea0304d218 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 15 Sep 2008 18:04:26 -0700
Subject: uaccess: fix parameters inversion for __copy_from_user_inatomic()

The following patch changes to use __copy_from_user_inatomic(),
but the passing parameters incorrect:

    x86: some lock annotations for user copy paths, v3

This fixes the netfilter crash reported by Steven Noonan.

Reported-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Tested-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 2062293e57e..6b58367d145 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 							\
 		set_fs(KERNEL_DS);			\
 		pagefault_disable();			\
-		ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval));		\
+		ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval));		\
 		pagefault_enable();			\
 		set_fs(old_fs);				\
 		ret;					\
-- 
cgit v1.2.3-70-g09d2


From 38d47c1b7075bd7ec3881141bb3629da58f88dab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Sep 2008 19:32:20 +0200
Subject: futex: rely on get_user_pages() for shared futexes

On the way of getting rid of the mmap_sem requirement for shared futexes,
start by relying on get_user_pages().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/futex.h |   2 +
 kernel/futex.c        | 162 +++++++++++++++++++++++++-------------------------
 2 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 586ab56a3ec..8f627b9ae2b 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -164,6 +164,8 @@ union futex_key {
 	} both;
 };
 
+#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
+
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
 extern void exit_pi_state_list(struct task_struct *curr);
diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c1..a4c39fa0a7a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -161,6 +161,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 		&& key1->both.offset == key2->both.offset);
 }
 
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+	if (!key->both.ptr)
+		return;
+
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+	case FUT_OFF_INODE:
+		atomic_inc(&key->shared.inode->i_count);
+		break;
+	case FUT_OFF_MMSHARED:
+		atomic_inc(&key->private.mm->mm_count);
+		break;
+	}
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+	if (!key->both.ptr)
+		return;
+
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+	case FUT_OFF_INODE:
+		iput(key->shared.inode);
+		break;
+	case FUT_OFF_MMSHARED:
+		mmdrop(key->private.mm);
+		break;
+	}
+}
+
 /**
  * get_futex_key - Get parameters which are the keys for a futex.
  * @uaddr: virtual address of the futex
@@ -184,7 +223,6 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
 	struct page *page;
 	int err;
 
@@ -210,98 +248,47 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
 		key->private.address = address;
 		return 0;
 	}
-	/*
-	 * The futex is hashed differently depending on whether
-	 * it's in a shared or private mapping.  So check vma first.
-	 */
-	vma = find_extend_vma(mm, address);
-	if (unlikely(!vma))
-		return -EFAULT;
 
-	/*
-	 * Permissions.
-	 */
-	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
-		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+again:
+	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+	if (err < 0)
+		return err;
+
+	lock_page(page);
+	if (!page->mapping) {
+		unlock_page(page);
+		put_page(page);
+		goto again;
+	}
 
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 	 * it's a read-only handle, it's expected that futexes attach to
-	 * the object not the particular process.  Therefore we use
-	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
-	 * mappings of _writable_ handles.
+	 * the object not the particular process.
 	 */
-	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
-		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+	if (PageAnon(page)) {
+		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
-		return 0;
-	}
-
-	/*
-	 * Linear file mappings are also simple.
-	 */
-	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
-	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
-				     + vma->vm_pgoff);
-		return 0;
+	} else {
+		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+		key->shared.inode = page->mapping->host;
+		key->shared.pgoff = page->index;
 	}
 
-	/*
-	 * We could walk the page table to read the non-linear
-	 * pte, and get the page index without fetching the page
-	 * from swap.  But that's a lot of code to duplicate here
-	 * for a rare case, so we simply fetch the page.
-	 */
-	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
-	if (err >= 0) {
-		key->shared.pgoff =
-			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		put_page(page);
-		return 0;
-	}
-	return err;
-}
+	get_futex_key_refs(key);
 
-/*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
-static void get_futex_key_refs(union futex_key *key)
-{
-	if (key->both.ptr == NULL)
-		return;
-	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-		case FUT_OFF_INODE:
-			atomic_inc(&key->shared.inode->i_count);
-			break;
-		case FUT_OFF_MMSHARED:
-			atomic_inc(&key->private.mm->mm_count);
-			break;
-	}
+	unlock_page(page);
+	put_page(page);
+	return 0;
 }
 
-/*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
- */
-static void drop_futex_key_refs(union futex_key *key)
+static inline
+void put_futex_key(struct rw_semaphore *fshared, union futex_key *key)
 {
-	if (!key->both.ptr)
-		return;
-	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-		case FUT_OFF_INODE:
-			iput(key->shared.inode);
-			break;
-		case FUT_OFF_MMSHARED:
-			mmdrop(key->private.mm);
-			break;
-	}
+	drop_futex_key_refs(key);
 }
 
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -385,6 +372,7 @@ static int refill_pi_state_cache(void)
 	/* pi_mutex gets initialized later */
 	pi_state->owner = NULL;
 	atomic_set(&pi_state->refcount, 1);
+	pi_state->key = FUTEX_KEY_INIT;
 
 	current->pi_state_cache = pi_state;
 
@@ -462,7 +450,7 @@ void exit_pi_state_list(struct task_struct *curr)
 	struct list_head *next, *head = &curr->pi_state_list;
 	struct futex_pi_state *pi_state;
 	struct futex_hash_bucket *hb;
-	union futex_key key;
+	union futex_key key = FUTEX_KEY_INIT;
 
 	if (!futex_cmpxchg_enabled)
 		return;
@@ -725,7 +713,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
 	struct plist_head *head;
-	union futex_key key;
+	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
 
 	if (!bitset)
@@ -760,6 +748,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 
 	spin_unlock(&hb->lock);
 out:
+	put_futex_key(fshared, &key);
 	futex_unlock_mm(fshared);
 	return ret;
 }
@@ -773,7 +762,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
 	      u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
-	union futex_key key1, key2;
+	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head;
 	struct futex_q *this, *next;
@@ -873,6 +862,8 @@ retry:
 	if (hb1 != hb2)
 		spin_unlock(&hb2->lock);
 out:
+	put_futex_key(fshared, &key2);
+	put_futex_key(fshared, &key1);
 	futex_unlock_mm(fshared);
 
 	return ret;
@@ -886,7 +877,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 			 u32 __user *uaddr2,
 			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
-	union futex_key key1, key2;
+	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head1;
 	struct futex_q *this, *next;
@@ -974,6 +965,8 @@ out_unlock:
 		drop_futex_key_refs(&key1);
 
 out:
+	put_futex_key(fshared, &key2);
+	put_futex_key(fshared, &key1);
 	futex_unlock_mm(fshared);
 	return ret;
 }
@@ -1220,6 +1213,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
  retry:
 	futex_lock_mm(fshared);
 
+	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
@@ -1360,6 +1354,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	queue_unlock(&q, hb);
 
  out_release_sem:
+	put_futex_key(fshared, &q.key);
 	futex_unlock_mm(fshared);
 	return ret;
 }
@@ -1411,6 +1406,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
  retry:
 	futex_lock_mm(fshared);
 
+	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
@@ -1625,6 +1621,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	queue_unlock(&q, hb);
 
  out_release_sem:
+	put_futex_key(fshared, &q.key);
 	futex_unlock_mm(fshared);
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
@@ -1671,7 +1668,7 @@ static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
 	struct futex_q *this, *next;
 	u32 uval;
 	struct plist_head *head;
-	union futex_key key;
+	union futex_key key = FUTEX_KEY_INIT;
 	int ret, attempt = 0;
 
 retry:
@@ -1744,6 +1741,7 @@ retry_unlocked:
 out_unlock:
 	spin_unlock(&hb->lock);
 out:
+	put_futex_key(fshared, &key);
 	futex_unlock_mm(fshared);
 
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From c7e78cff6b7518212247fb20b1dc6411540dc9af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 16 Oct 2008 23:17:09 +0200
Subject: lockstat: contend with points

We currently only provide points that have to wait on contention, also
lists the points we have to wait for.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/lockstat.txt | 50 +++++++++++++++++++++++++++++-----------------
 include/linux/lockdep.h    | 13 ++++++++----
 kernel/lockdep.c           | 33 +++++++++++++++++++-----------
 kernel/lockdep_proc.c      | 21 +++++++++++++++++--
 kernel/mutex.c             |  2 +-
 5 files changed, 82 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index 02f36f5c64f..9cb9138f7a7 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -71,34 +71,48 @@ Look at the current lock statistics:
 
 # less /proc/lock_stat
 
-01 lock_stat version 0.2
+01 lock_stat version 0.3
 02 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 03                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
 04 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 05
-06               &inode->i_data.tree_lock-W:            15          21657           0.18     1093295.30 11547131054.85             58          10415           0.16          87.51        6387.60
-07               &inode->i_data.tree_lock-R:             0              0           0.00           0.00           0.00          23302         231198           0.25           8.45       98023.38
-08               --------------------------
-09                 &inode->i_data.tree_lock              0          [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190
-10
-11 ...............................................................................................................................................................................................
-12
-13                              dcache_lock:          1037           1161           0.38          45.32         774.51           6611         243371           0.15         306.48       77387.24
-14                              -----------
-15                              dcache_lock            180          [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230
-16                              dcache_lock            165          [<ffffffff802c002a>] d_alloc+0x15a/0x210
-17                              dcache_lock             33          [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70
-18                              dcache_lock              1          [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130
+06                          &mm->mmap_sem-W:           233            538 18446744073708       22924.27      607243.51           1342          45806           1.71        8595.89     1180582.34
+07                          &mm->mmap_sem-R:           205            587 18446744073708       28403.36      731975.00           1940         412426           0.58      187825.45     6307502.88
+08                          ---------------
+09                            &mm->mmap_sem            487          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+10                            &mm->mmap_sem            179          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+11                            &mm->mmap_sem            279          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+12                            &mm->mmap_sem             76          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+13                          ---------------
+14                            &mm->mmap_sem            270          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+15                            &mm->mmap_sem            431          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+16                            &mm->mmap_sem            138          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+17                            &mm->mmap_sem            145          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+18
+19 ...............................................................................................................................................................................................
+20
+21                              dcache_lock:           621            623           0.52         118.26        1053.02           6745          91930           0.29         316.29      118423.41
+22                              -----------
+23                              dcache_lock            179          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+24                              dcache_lock            113          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+25                              dcache_lock             99          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+26                              dcache_lock            104          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
+27                              -----------
+28                              dcache_lock            192          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+29                              dcache_lock             98          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+30                              dcache_lock             72          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+31                              dcache_lock            112          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
 
 This excerpt shows the first two lock class statistics. Line 01 shows the
 output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-10 and 13-18 show the actual
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
 statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 14) from the contention points.
+short separator (line 08, 13) from the contention points.
 
-The first lock (05-10) is a read/write lock, and shows two lines above the
+The first lock (05-18) is a read/write lock, and shows two lines above the
 short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol.
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
 
 The integer part of the time values is in us.
 
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 0aa657aa8a1..fc9f8e88123 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -73,6 +73,8 @@ struct lock_class_key {
 	struct lockdep_subclass_key	subkeys[MAX_LOCKDEP_SUBCLASSES];
 };
 
+#define LOCKSTAT_POINTS		4
+
 /*
  * The lock-class itself:
  */
@@ -119,7 +121,8 @@ struct lock_class {
 	int				name_version;
 
 #ifdef CONFIG_LOCK_STAT
-	unsigned long			contention_point[4];
+	unsigned long			contention_point[LOCKSTAT_POINTS];
+	unsigned long			contending_point[LOCKSTAT_POINTS];
 #endif
 };
 
@@ -144,6 +147,7 @@ enum bounce_type {
 
 struct lock_class_stats {
 	unsigned long			contention_point[4];
+	unsigned long			contending_point[4];
 	struct lock_time		read_waittime;
 	struct lock_time		write_waittime;
 	struct lock_time		read_holdtime;
@@ -165,6 +169,7 @@ struct lockdep_map {
 	const char			*name;
 #ifdef CONFIG_LOCK_STAT
 	int				cpu;
+	unsigned long			ip;
 #endif
 };
 
@@ -355,7 +360,7 @@ struct lock_class_key { };
 #ifdef CONFIG_LOCK_STAT
 
 extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
-extern void lock_acquired(struct lockdep_map *lock);
+extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);
 
 #define LOCK_CONTENDED(_lock, try, lock)			\
 do {								\
@@ -363,13 +368,13 @@ do {								\
 		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
 		lock(_lock);					\
 	}							\
-	lock_acquired(&(_lock)->dep_map);			\
+	lock_acquired(&(_lock)->dep_map, _RET_IP_);			\
 } while (0)
 
 #else /* CONFIG_LOCK_STAT */
 
 #define lock_contended(lockdep_map, ip) do {} while (0)
-#define lock_acquired(lockdep_map) do {} while (0)
+#define lock_acquired(lockdep_map, ip) do {} while (0)
 
 #define LOCK_CONTENDED(_lock, try, lock) \
 	lock(_lock)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index dbda475b13b..234a9dccb4b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -136,16 +136,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
 
-static int lock_contention_point(struct lock_class *class, unsigned long ip)
+static int lock_point(unsigned long points[], unsigned long ip)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
-		if (class->contention_point[i] == 0) {
-			class->contention_point[i] = ip;
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
+		if (points[i] == 0) {
+			points[i] = ip;
 			break;
 		}
-		if (class->contention_point[i] == ip)
+		if (points[i] == ip)
 			break;
 	}
 
@@ -185,6 +185,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
 		for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
 			stats.contention_point[i] += pcs->contention_point[i];
 
+		for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+			stats.contending_point[i] += pcs->contending_point[i];
+
 		lock_time_add(&pcs->read_waittime, &stats.read_waittime);
 		lock_time_add(&pcs->write_waittime, &stats.write_waittime);
 
@@ -209,6 +212,7 @@ void clear_lock_stats(struct lock_class *class)
 		memset(cpu_stats, 0, sizeof(struct lock_class_stats));
 	}
 	memset(class->contention_point, 0, sizeof(class->contention_point));
+	memset(class->contending_point, 0, sizeof(class->contending_point));
 }
 
 static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -3001,7 +3005,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 	struct held_lock *hlock, *prev_hlock;
 	struct lock_class_stats *stats;
 	unsigned int depth;
-	int i, point;
+	int i, contention_point, contending_point;
 
 	depth = curr->lockdep_depth;
 	if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3025,18 +3029,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 found_it:
 	hlock->waittime_stamp = sched_clock();
 
-	point = lock_contention_point(hlock_class(hlock), ip);
+	contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+	contending_point = lock_point(hlock_class(hlock)->contending_point,
+				      lock->ip);
 
 	stats = get_lock_stats(hlock_class(hlock));
-	if (point < ARRAY_SIZE(stats->contention_point))
-		stats->contention_point[point]++;
+	if (contention_point < LOCKSTAT_POINTS)
+		stats->contention_point[contention_point]++;
+	if (contending_point < LOCKSTAT_POINTS)
+		stats->contending_point[contending_point]++;
 	if (lock->cpu != smp_processor_id())
 		stats->bounces[bounce_contended + !!hlock->read]++;
 	put_lock_stats(stats);
 }
 
 static void
-__lock_acquired(struct lockdep_map *lock)
+__lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
@@ -3085,6 +3093,7 @@ found_it:
 	put_lock_stats(stats);
 
 	lock->cpu = cpu;
+	lock->ip = ip;
 }
 
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3106,7 +3115,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-void lock_acquired(struct lockdep_map *lock)
+void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -3119,7 +3128,7 @@ void lock_acquired(struct lockdep_map *lock)
 	raw_local_irq_save(flags);
 	check_flags(flags);
 	current->lockdep_recursion = 1;
-	__lock_acquired(lock);
+	__lock_acquired(lock, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 8d3a6eba8d5..13716b81389 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -557,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 	if (stats->read_holdtime.nr)
 		namelen += 2;
 
-	for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
 		char sym[KSYM_SYMBOL_LEN];
 		char ip[32];
 
@@ -574,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 				stats->contention_point[i],
 				ip, sym);
 	}
+	for (i = 0; i < LOCKSTAT_POINTS; i++) {
+		char sym[KSYM_SYMBOL_LEN];
+		char ip[32];
+
+		if (class->contending_point[i] == 0)
+			break;
+
+		if (!i)
+			seq_line(m, '-', 40-namelen, namelen);
+
+		sprint_symbol(sym, class->contending_point[i]);
+		snprintf(ip, sizeof(ip), "[<%p>]",
+				(void *)class->contending_point[i]);
+		seq_printf(m, "%40s %14lu %29s %s\n", name,
+				stats->contending_point[i],
+				ip, sym);
+	}
 	if (i) {
 		seq_puts(m, "\n");
 		seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -583,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 
 static void seq_header(struct seq_file *m)
 {
-	seq_printf(m, "lock_stat version 0.2\n");
+	seq_printf(m, "lock_stat version 0.3\n");
 	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
 	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
 			"%14s %14s\n",
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d..39a3816b68d 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	}
 
 done:
-	lock_acquired(&lock->dep_map);
+	lock_acquired(&lock->dep_map, ip);
 	/* got the lock - rejoice! */
 	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
 	debug_mutex_set_owner(lock, task_thread_info(task));
-- 
cgit v1.2.3-70-g09d2


From a53ccab3ccac9e8676a683df9822a2daec83ef54 Mon Sep 17 00:00:00 2001
From: Matthew Ranostay <mranostay@embeddedalley.com>
Date: Sat, 25 Oct 2008 01:05:04 -0400
Subject: ALSA: jack: lineout support to jack abstraction layer

This patch introduces support for reporting SW_LINEOUT_INSERT detection events
via the jack abstraction layer.

Also adds a SND_JACK_LINEOUT define to the input system header.

Signed-off-by: Matthew Ranostay <mranostay@embeddedalley.com>
Cc: Dmitry Torokhov <dtor@mail.ru>
Acked-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/input.h | 1 +
 include/sound/jack.h  | 1 +
 sound/core/jack.c     | 6 ++++++
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index a5802c9c81a..7323d2ff515 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -644,6 +644,7 @@ struct input_absinfo {
 #define SW_RADIO		SW_RFKILL_ALL	/* deprecated */
 #define SW_MICROPHONE_INSERT	0x04  /* set = inserted */
 #define SW_DOCK			0x05  /* set = plugged into dock */
+#define SW_LINEOUT_INSERT	0x06  /* set = inserted */
 #define SW_MAX			0x0f
 #define SW_CNT			(SW_MAX+1)
 
diff --git a/include/sound/jack.h b/include/sound/jack.h
index b1b2b8b59ad..7cb25f4b50b 100644
--- a/include/sound/jack.h
+++ b/include/sound/jack.h
@@ -35,6 +35,7 @@ enum snd_jack_types {
 	SND_JACK_HEADPHONE	= 0x0001,
 	SND_JACK_MICROPHONE	= 0x0002,
 	SND_JACK_HEADSET	= SND_JACK_HEADPHONE | SND_JACK_MICROPHONE,
+	SND_JACK_LINEOUT	= 0x0004,
 };
 
 struct snd_jack {
diff --git a/sound/core/jack.c b/sound/core/jack.c
index 8133a2b173a..c4bb9bad313 100644
--- a/sound/core/jack.c
+++ b/sound/core/jack.c
@@ -102,6 +102,9 @@ int snd_jack_new(struct snd_card *card, const char *id, int type,
 	if (type & SND_JACK_HEADPHONE)
 		input_set_capability(jack->input_dev, EV_SW,
 				     SW_HEADPHONE_INSERT);
+	if (type & SND_JACK_LINEOUT)
+		input_set_capability(jack->input_dev, EV_SW,
+				     SW_LINEOUT_INSERT);
 	if (type & SND_JACK_MICROPHONE)
 		input_set_capability(jack->input_dev, EV_SW,
 				     SW_MICROPHONE_INSERT);
@@ -150,6 +153,9 @@ void snd_jack_report(struct snd_jack *jack, int status)
 	if (jack->type & SND_JACK_HEADPHONE)
 		input_report_switch(jack->input_dev, SW_HEADPHONE_INSERT,
 				    status & SND_JACK_HEADPHONE);
+	if (jack->type & SND_JACK_LINEOUT)
+		input_report_switch(jack->input_dev, SW_LINEOUT_INSERT,
+				    status & SND_JACK_LINEOUT);
 	if (jack->type & SND_JACK_MICROPHONE)
 		input_report_switch(jack->input_dev, SW_MICROPHONE_INSERT,
 				    status & SND_JACK_MICROPHONE);
-- 
cgit v1.2.3-70-g09d2


From 505e371da195fad20cb8aaf45407a2849774d6d0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 15 Oct 2008 14:56:42 +0800
Subject: markers: remove exported symbol marker_probe_cb_noarg()

marker_probe_cb_noarg() should not be seen by outer code.
this patch remove it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 2 --
 kernel/marker.c        | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 889196c7fbb..4cf45472d9f 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -136,8 +136,6 @@ extern marker_probe_func __mark_empty_function;
 
 extern void marker_probe_cb(const struct marker *mdata,
 	void *call_private, ...);
-extern void marker_probe_cb_noarg(const struct marker *mdata,
-	void *call_private, ...);
 
 /*
  * Connect a probe to a marker.
diff --git a/kernel/marker.c b/kernel/marker.c
index 75a1a17cd78..23192646555 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;	/* not initialized */
 	char ptype;
@@ -198,7 +198,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 	}
 	rcu_read_unlock_sched();
 }
-EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
 
 static void free_old_closure(struct rcu_head *head)
 {
-- 
cgit v1.2.3-70-g09d2


From 944ac4259e39801c843a915c3da8194ac9af0440 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 23 Oct 2008 19:26:08 -0400
Subject: ftrace: ftrace dump on oops control

Impact: add (default-off) dump-trace-on-oops flag

Currently, ftrace is set up to dump its contents to the console if the
kernel panics or oops. This can be annoying if you have trace data in
the buffers and you experience an oops, but the trace data is old or
static.

Usually when you want ftrace to dump its contents is when you are debugging
your system and you have set up ftrace to trace the events leading to
an oops.

This patch adds a control variable called "ftrace_dump_on_oops" that will
enable the ftrace dump to console on oops. This variable is default off
but a developer can enable it either through the kernel command line
by adding "ftrace_dump_on_oops" or at run time by setting (or disabling)
/proc/sys/kernel/ftrace_dump_on_oops.

v2:

   Replaced /** with /* as Randy explained that kernel-doc does
    not yet handle variables.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  2 ++
 kernel/sysctl.c        | 10 ++++++++++
 kernel/trace/trace.c   | 29 ++++++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index a3d46151be1..9623b7b9e5a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -165,6 +165,8 @@ static inline void __ftrace_enabled_restore(int enabled)
 #endif
 
 #ifdef CONFIG_TRACING
+extern int ftrace_dump_on_oops;
+
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a13bd4dfaeb..84754f5801e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -484,6 +484,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &ftrace_enable_sysctl,
 	},
 #endif
+#ifdef CONFIG_TRACING
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_dump_on_opps",
+		.data		= &ftrace_dump_on_oops,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_MODULES
 	{
 		.ctl_name	= KERN_MODPROBE,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d345d649d07..47f46cbdd86 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -63,6 +63,28 @@ static cpumask_t __read_mostly		tracing_buffer_mask;
 
 static int tracing_disabled = 1;
 
+/*
+ * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+ *
+ * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+ * is set, then ftrace_dump is called. This will output the contents
+ * of the ftrace buffers to the console.  This is very useful for
+ * capturing traces that lead to crashes and outputing it to a
+ * serial console.
+ *
+ * It is default off, but you can enable it with either specifying
+ * "ftrace_dump_on_oops" in the kernel command line, or setting
+ * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ */
+int ftrace_dump_on_oops;
+
+static int __init set_ftrace_dump_on_oops(char *str)
+{
+	ftrace_dump_on_oops = 1;
+	return 1;
+}
+__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
+
 long
 ns2usecs(cycle_t nsec)
 {
@@ -3021,7 +3043,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
-	ftrace_dump();
+	if (ftrace_dump_on_oops)
+		ftrace_dump();
 	return NOTIFY_OK;
 }
 
@@ -3037,7 +3060,8 @@ static int trace_die_handler(struct notifier_block *self,
 {
 	switch (val) {
 	case DIE_OOPS:
-		ftrace_dump();
+		if (ftrace_dump_on_oops)
+			ftrace_dump();
 		break;
 	default:
 		break;
@@ -3078,7 +3102,6 @@ trace_printk_seq(struct trace_seq *s)
 	trace_seq_reset(s);
 }
 
-
 void ftrace_dump(void)
 {
 	static DEFINE_SPINLOCK(ftrace_dump_lock);
-- 
cgit v1.2.3-70-g09d2


From cae1c11414912bf77a62aebd65ced321f0b9da51 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 27 Oct 2008 15:22:46 +0000
Subject: uwb: reference count reservations

Reference counting the struct uwb_rsv's is safer and easier to get right than
the transferring ownership of the structures from the PAL to reservation
manager.

This fixes an oops in the debug PAL after a reservation timed out.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/usb/wusbcore/reservation.c |  7 +++---
 drivers/uwb/rsv.c                  | 48 +++++++++++++++++++++++++-------------
 drivers/uwb/uwb-debug.c            | 13 ++++++++---
 include/linux/uwb.h                |  1 +
 4 files changed, 46 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/wusbcore/reservation.c b/drivers/usb/wusbcore/reservation.c
index fc63e77ded2..7b6525dac2f 100644
--- a/drivers/usb/wusbcore/reservation.c
+++ b/drivers/usb/wusbcore/reservation.c
@@ -59,7 +59,6 @@ static void wusbhc_rsv_complete_cb(struct uwb_rsv *rsv)
 	case UWB_RSV_STATE_NONE:
 		dev_dbg(dev, "removed reservation\n");
 		wusbhc_bwa_set(wusbhc, 0, NULL);
-		wusbhc->rsv = NULL;
 		break;
 	default:
 		dev_dbg(dev, "unexpected reservation state: %d\n", rsv->state);
@@ -105,11 +104,11 @@ int wusbhc_rsv_establish(struct wusbhc *wusbhc)
 
 
 /**
- * wusbhc_rsv_terminate - terminate any cluster reservation
+ * wusbhc_rsv_terminate - terminate the cluster reservation
  * @wusbhc: the WUSB host whose reservation is to be terminated
  */
 void wusbhc_rsv_terminate(struct wusbhc *wusbhc)
 {
-	if (wusbhc->rsv)
-		uwb_rsv_terminate(wusbhc->rsv);
+	uwb_rsv_terminate(wusbhc->rsv);
+	uwb_rsv_destroy(wusbhc->rsv);
 }
diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index e4facae46e0..bcc41a4a660 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -82,6 +82,23 @@ static void uwb_rsv_dump(struct uwb_rsv *rsv)
 	dev_dbg(dev, "rsv %s -> %s: %s\n", owner, target, uwb_rsv_state_str(rsv->state));
 }
 
+static void uwb_rsv_release(struct kref *kref)
+{
+	struct uwb_rsv *rsv = container_of(kref, struct uwb_rsv, kref);
+
+	kfree(rsv);
+}
+
+static void uwb_rsv_get(struct uwb_rsv *rsv)
+{
+	kref_get(&rsv->kref);
+}
+
+static void uwb_rsv_put(struct uwb_rsv *rsv)
+{
+	kref_put(&rsv->kref, uwb_rsv_release);
+}
+
 /*
  * Get a free stream index for a reservation.
  *
@@ -325,6 +342,7 @@ static struct uwb_rsv *uwb_rsv_alloc(struct uwb_rc *rc)
 
 	INIT_LIST_HEAD(&rsv->rc_node);
 	INIT_LIST_HEAD(&rsv->pal_node);
+	kref_init(&rsv->kref);
 	init_timer(&rsv->timer);
 	rsv->timer.function = uwb_rsv_timer;
 	rsv->timer.data     = (unsigned long)rsv;
@@ -334,14 +352,6 @@ static struct uwb_rsv *uwb_rsv_alloc(struct uwb_rc *rc)
 	return rsv;
 }
 
-static void uwb_rsv_free(struct uwb_rsv *rsv)
-{
-	uwb_dev_put(rsv->owner);
-	if (rsv->target.type == UWB_RSV_TARGET_DEV)
-		uwb_dev_put(rsv->target.dev);
-	kfree(rsv);
-}
-
 /**
  * uwb_rsv_create - allocate and initialize a UWB reservation structure
  * @rc: the radio controller
@@ -375,23 +385,23 @@ void uwb_rsv_remove(struct uwb_rsv *rsv)
 	if (rsv->state != UWB_RSV_STATE_NONE)
 		uwb_rsv_set_state(rsv, UWB_RSV_STATE_NONE);
 	del_timer_sync(&rsv->timer);
-	list_del(&rsv->rc_node);
-	uwb_rsv_free(rsv);
+	uwb_dev_put(rsv->owner);
+	if (rsv->target.type == UWB_RSV_TARGET_DEV)
+		uwb_dev_put(rsv->target.dev);
+
+	list_del_init(&rsv->rc_node);
+	uwb_rsv_put(rsv);
 }
 
 /**
  * uwb_rsv_destroy - free a UWB reservation structure
  * @rsv: the reservation to free
  *
- * The reservation will be terminated if it is pending or established.
+ * The reservation must already be terminated.
  */
 void uwb_rsv_destroy(struct uwb_rsv *rsv)
 {
-	struct uwb_rc *rc = rsv->rc;
-
-	mutex_lock(&rc->rsvs_mutex);
-	uwb_rsv_remove(rsv);
-	mutex_unlock(&rc->rsvs_mutex);
+	uwb_rsv_put(rsv);
 }
 EXPORT_SYMBOL_GPL(uwb_rsv_destroy);
 
@@ -423,6 +433,7 @@ int uwb_rsv_establish(struct uwb_rsv *rsv)
 		goto out;
 	}
 
+	uwb_rsv_get(rsv);
 	list_add_tail(&rsv->rc_node, &rc->reservations);
 	rsv->owner = &rc->uwb_dev;
 	uwb_dev_get(rsv->owner);
@@ -478,9 +489,14 @@ EXPORT_SYMBOL_GPL(uwb_rsv_terminate);
  *
  * Reservation requests from peers are denied unless a PAL accepts it
  * by calling this function.
+ *
+ * The PAL call uwb_rsv_destroy() for all accepted reservations before
+ * calling uwb_pal_unregister().
  */
 void uwb_rsv_accept(struct uwb_rsv *rsv, uwb_rsv_cb_f cb, void *pal_priv)
 {
+	uwb_rsv_get(rsv);
+
 	rsv->callback = cb;
 	rsv->pal_priv = pal_priv;
 	rsv->state    = UWB_RSV_STATE_T_ACCEPTED;
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 6d232c35d07..6db641e4531 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -104,6 +104,11 @@ static void uwb_dbg_rsv_cb(struct uwb_rsv *rsv)
 
 	dev_dbg(dev, "debug: rsv %s -> %s: %s\n",
 		owner, target, uwb_rsv_state_str(rsv->state));
+
+	if (rsv->state == UWB_RSV_STATE_NONE) {
+		list_del(&rsv->pal_node);
+		uwb_rsv_destroy(rsv);
+	}
 }
 
 static int cmd_rsv_establish(struct uwb_rc *rc,
@@ -153,11 +158,11 @@ static int cmd_rsv_terminate(struct uwb_rc *rc,
 			found = rsv;
 			break;
 		}
+		i++;
 	}
 	if (!found)
 		return -EINVAL;
 
-	list_del(&found->pal_node);
 	uwb_rsv_terminate(found);
 
 	return 0;
@@ -287,8 +292,10 @@ static void uwb_dbg_new_rsv(struct uwb_rsv *rsv)
 {
 	struct uwb_rc *rc = rsv->rc;
 
-	if (rc->dbg->accept)
+	if (rc->dbg->accept) {
+		list_add_tail(&rsv->pal_node, &rc->dbg->rsvs);
 		uwb_rsv_accept(rsv, uwb_dbg_rsv_cb, NULL);
+	}
 }
 
 /**
@@ -336,7 +343,7 @@ void uwb_dbg_del_rc(struct uwb_rc *rc)
 		return;
 
 	list_for_each_entry_safe(rsv, t, &rc->dbg->rsvs, pal_node) {
-		uwb_rsv_destroy(rsv);
+		uwb_rsv_terminate(rsv);
 	}
 
 	uwb_pal_unregister(rc, &rc->dbg->pal);
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index f9ccbd9a2ce..010ee708304 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -201,6 +201,7 @@ struct uwb_rsv {
 	struct uwb_rc *rc;
 	struct list_head rc_node;
 	struct list_head pal_node;
+	struct kref kref;
 
 	struct uwb_dev *owner;
 	struct uwb_rsv_target target;
-- 
cgit v1.2.3-70-g09d2


From 4d2bea4ca0adb4cebfbf89d34869c74081c42577 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 27 Oct 2008 15:42:31 +0000
Subject: wusb: do a proper channel stop

When stopping the WUSB channel the host should send Channel Stop IEs giving
the WUSB Channel Time of the last MMC.  Both WHCI and HWA hosts provide a
channel stop command for this.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/usb/host/hwa-hc.c       | 102 +++++++++++++++++++---------------------
 drivers/usb/host/whci/whcd.h    |   2 +-
 drivers/usb/host/whci/whci-hc.h |   2 +
 drivers/usb/host/whci/wusb.c    |  15 ++++--
 drivers/usb/wusbcore/mmc.c      |   8 +---
 drivers/usb/wusbcore/wusbhc.h   |  24 ++++++----
 include/linux/usb/wusb-wa.h     |   1 +
 7 files changed, 80 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/host/hwa-hc.c b/drivers/usb/host/hwa-hc.c
index 64be4d88df1..0e18989e165 100644
--- a/drivers/usb/host/hwa-hc.c
+++ b/drivers/usb/host/hwa-hc.c
@@ -171,11 +171,6 @@ static int hwahc_op_start(struct usb_hcd *usb_hcd)
 	if (result < 0)
 		goto error_set_cluster_id;
 
-	result = wa_nep_arm(&hwahc->wa, GFP_KERNEL);
-	if (result < 0) {
-		dev_err(dev, "cannot listen to notifications: %d\n", result);
-		goto error_stop;
-	}
 	usb_hcd->uses_new_polling = 1;
 	usb_hcd->poll_rh = 1;
 	usb_hcd->state = HC_STATE_RUNNING;
@@ -185,8 +180,6 @@ out:
 	d_fnend(4, dev, "(hwahc %p) = %d\n", hwahc, result);
 	return result;
 
-error_stop:
-	__wa_stop(&hwahc->wa);
 error_set_cluster_id:
 	wusb_cluster_id_put(wusbhc->cluster_id);
 error_cluster_id_get:
@@ -194,39 +187,6 @@ error_cluster_id_get:
 
 }
 
-/*
- * FIXME: break this function up
- */
-static int __hwahc_op_wusbhc_start(struct wusbhc *wusbhc)
-{
-	int result;
-	struct hwahc *hwahc = container_of(wusbhc, struct hwahc, wusbhc);
-	struct device *dev = &hwahc->wa.usb_iface->dev;
-
-	/* Set up a Host Info WUSB Information Element */
-	d_fnstart(4, dev, "(hwahc %p)\n", hwahc);
-	result = -ENOSPC;
-
-	result = __wa_set_feature(&hwahc->wa, WA_ENABLE);
-	if (result < 0) {
-		dev_err(dev, "error commanding HC to start: %d\n", result);
-		goto error_stop;
-	}
-	result = __wa_wait_status(&hwahc->wa, WA_ENABLE, WA_ENABLE);
-	if (result < 0) {
-		dev_err(dev, "error waiting for HC to start: %d\n", result);
-		goto error_stop;
-	}
-	result = 0;
-out:
-	d_fnend(4, dev, "(hwahc %p) = %d\n", hwahc, result);
-	return result;
-
-error_stop:
-	result = __wa_clear_feature(&hwahc->wa, WA_ENABLE);
-	goto out;
-}
-
 static int hwahc_op_suspend(struct usb_hcd *usb_hcd, pm_message_t msg)
 {
 	struct wusbhc *wusbhc = usb_hcd_to_wusbhc(usb_hcd);
@@ -246,18 +206,6 @@ static int hwahc_op_resume(struct usb_hcd *usb_hcd)
 	return -ENOSYS;
 }
 
-static void __hwahc_op_wusbhc_stop(struct wusbhc *wusbhc)
-{
-	int result;
-	struct hwahc *hwahc = container_of(wusbhc, struct hwahc, wusbhc);
-	struct device *dev = &hwahc->wa.usb_iface->dev;
-
-	d_fnstart(4, dev, "(hwahc %p)\n", hwahc);
-	/* Nothing for now */
-	d_fnend(4, dev, "(hwahc %p) = %d\n", hwahc, result);
-	return;
-}
-
 /*
  * No need to abort pipes, as when this is called, all the children
  * has been disconnected and that has done it [through
@@ -275,8 +223,6 @@ static void hwahc_op_stop(struct usb_hcd *usb_hcd)
 	d_fnstart(4, dev, "(hwahc %p)\n", hwahc);
 	mutex_lock(&wusbhc->mutex);
 	wusbhc_stop(wusbhc);
-	wa_nep_disarm(&hwahc->wa);
-	result = __wa_stop(&hwahc->wa);
 	wusb_cluster_id_put(wusbhc->cluster_id);
 	mutex_unlock(&wusbhc->mutex);
 	d_fnend(4, dev, "(hwahc %p) = %d\n", hwahc, result);
@@ -325,6 +271,54 @@ static void hwahc_op_endpoint_disable(struct usb_hcd *usb_hcd,
 	rpipe_ep_disable(&hwahc->wa, ep);
 }
 
+static int __hwahc_op_wusbhc_start(struct wusbhc *wusbhc)
+{
+	int result;
+	struct hwahc *hwahc = container_of(wusbhc, struct hwahc, wusbhc);
+	struct device *dev = &hwahc->wa.usb_iface->dev;
+
+	result = __wa_set_feature(&hwahc->wa, WA_ENABLE);
+	if (result < 0) {
+		dev_err(dev, "error commanding HC to start: %d\n", result);
+		goto error_stop;
+	}
+	result = __wa_wait_status(&hwahc->wa, WA_ENABLE, WA_ENABLE);
+	if (result < 0) {
+		dev_err(dev, "error waiting for HC to start: %d\n", result);
+		goto error_stop;
+	}
+	result = wa_nep_arm(&hwahc->wa, GFP_KERNEL);
+	if (result < 0) {
+		dev_err(dev, "cannot listen to notifications: %d\n", result);
+		goto error_stop;
+	}
+	return result;
+
+error_stop:
+	__wa_clear_feature(&hwahc->wa, WA_ENABLE);
+	return result;
+}
+
+static void __hwahc_op_wusbhc_stop(struct wusbhc *wusbhc, int delay)
+{
+	struct hwahc *hwahc = container_of(wusbhc, struct hwahc, wusbhc);
+	struct wahc *wa = &hwahc->wa;
+	u8 iface_no = wa->usb_iface->cur_altsetting->desc.bInterfaceNumber;
+	int ret;
+
+	ret = usb_control_msg(wa->usb_dev, usb_sndctrlpipe(wa->usb_dev, 0),
+			      WUSB_REQ_CHAN_STOP,
+			      USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE,
+			      delay * 1000,
+			      iface_no,
+			      NULL, 0, 1000 /* FIXME: arbitrary */);
+	if (ret == 0)
+		msleep(delay);
+
+	wa_nep_disarm(&hwahc->wa);
+	__wa_stop(&hwahc->wa);
+}
+
 /*
  * Set the UWB MAS allocation for the WUSB cluster
  *
diff --git a/drivers/usb/host/whci/whcd.h b/drivers/usb/host/whci/whcd.h
index 1d2a53bd39f..1bbb8cb6bf8 100644
--- a/drivers/usb/host/whci/whcd.h
+++ b/drivers/usb/host/whci/whcd.h
@@ -136,7 +136,7 @@ int whc_do_gencmd(struct whc *whc, u32 cmd, u32 params, void *addr, size_t len);
 
 /* wusb.c */
 int whc_wusbhc_start(struct wusbhc *wusbhc);
-void whc_wusbhc_stop(struct wusbhc *wusbhc);
+void whc_wusbhc_stop(struct wusbhc *wusbhc, int delay);
 int whc_mmcie_add(struct wusbhc *wusbhc, u8 interval, u8 repeat_cnt,
 		  u8 handle, struct wuie_hdr *wuie);
 int whc_mmcie_rm(struct wusbhc *wusbhc, u8 handle);
diff --git a/drivers/usb/host/whci/whci-hc.h b/drivers/usb/host/whci/whci-hc.h
index bff1eb7a35c..51df7e313b3 100644
--- a/drivers/usb/host/whci/whci-hc.h
+++ b/drivers/usb/host/whci/whci-hc.h
@@ -410,6 +410,8 @@ struct dn_buf_entry {
 #  define WUSBDNTSCTRL_SLOTS(s)    ((s) << 0)
 
 #define WUSBTIME             0x68
+#  define WUSBTIME_CHANNEL_TIME_MASK 0x00ffffff
+
 #define WUSBBPST             0x6c
 #define WUSBDIBUPDATED       0x70
 
diff --git a/drivers/usb/host/whci/wusb.c b/drivers/usb/host/whci/wusb.c
index 66e4ddcd961..2befd475def 100644
--- a/drivers/usb/host/whci/wusb.c
+++ b/drivers/usb/host/whci/wusb.c
@@ -64,8 +64,9 @@ static int whc_update_di(struct whc *whc, int idx)
 }
 
 /*
- * WHCI starts and stops MMCs based on there being a valid GTK so
- * these need only start/stop the asynchronous and periodic schedules.
+ * WHCI starts MMCs based on there being a valid GTK so these need
+ * only start/stop the asynchronous and periodic schedules and send a
+ * channel stop command.
  */
 
 int whc_wusbhc_start(struct wusbhc *wusbhc)
@@ -78,12 +79,20 @@ int whc_wusbhc_start(struct wusbhc *wusbhc)
 	return 0;
 }
 
-void whc_wusbhc_stop(struct wusbhc *wusbhc)
+void whc_wusbhc_stop(struct wusbhc *wusbhc, int delay)
 {
 	struct whc *whc = wusbhc_to_whc(wusbhc);
+	u32 stop_time, now_time;
+	int ret;
 
 	pzl_stop(whc);
 	asl_stop(whc);
+
+	now_time = le_readl(whc->base + WUSBTIME) & WUSBTIME_CHANNEL_TIME_MASK;
+	stop_time = (now_time + ((delay * 8) << 7)) & 0x00ffffff;
+	ret = whc_do_gencmd(whc, WUSBGENCMDSTS_CHAN_STOP, stop_time, NULL, 0);
+	if (ret == 0)
+		msleep(delay);
 }
 
 int whc_mmcie_add(struct wusbhc *wusbhc, u8 interval, u8 repeat_cnt,
diff --git a/drivers/usb/wusbcore/mmc.c b/drivers/usb/wusbcore/mmc.c
index cfa77a01ceb..af2aee0fdff 100644
--- a/drivers/usb/wusbcore/mmc.c
+++ b/drivers/usb/wusbcore/mmc.c
@@ -250,18 +250,14 @@ error_alloc:
  * wusbhc_stop - stop transmitting MMCs
  * @wusbhc: the HC to stop
  *
- * Send a Host Disconnect IE, wait, remove all the MMCs (stop sending MMCs).
- *
- * If we can't allocate a Host Stop IE, screw it, we don't notify the
- * devices we are disconnecting...
+ * Stops the WUSB channel and removes the cluster reservation.
  */
 void wusbhc_stop(struct wusbhc *wusbhc)
 {
 	if (wusbhc->active) {
 		wusbhc->active = 0;
-		wusbhc->stop(wusbhc);
+		wusbhc->stop(wusbhc, WUSB_CHANNEL_STOP_DELAY_MS);
 		wusbhc_sec_stop(wusbhc);
-		__wusbhc_host_disconnect_ie(wusbhc);
 		wusbhc_devconnect_stop(wusbhc);
 		wusbhc_rsv_terminate(wusbhc);
 	}
diff --git a/drivers/usb/wusbcore/wusbhc.h b/drivers/usb/wusbcore/wusbhc.h
index d0c132434f1..b9bdf5a5f11 100644
--- a/drivers/usb/wusbcore/wusbhc.h
+++ b/drivers/usb/wusbcore/wusbhc.h
@@ -64,6 +64,13 @@
 #include <linux/uwb.h>
 #include <linux/usb/wusb.h>
 
+/*
+ * Time from a WUSB channel stop request to the last transmitted MMC.
+ *
+ * This needs to be > 4.096 ms in case no MMCs can be transmitted in
+ * zone 0.
+ */
+#define WUSB_CHANNEL_STOP_DELAY_MS 8
 
 /**
  * Wireless USB device
@@ -198,21 +205,18 @@ struct wusb_port {
  * @mmcies_max	   Max number of Information Elements this HC can send
  *                 in its MMC. Read-only.
  *
+ * @start          Start the WUSB channel.
+ *
+ * @stop           Stop the WUSB channel after the specified number of
+ *                 milliseconds.  Channel Stop IEs should be transmitted
+ *                 as required by [WUSB] 4.16.2.1.
+ *
  * @mmcie_add	   HC specific operation (WHCI or HWA) for adding an
  *                 MMCIE.
  *
  * @mmcie_rm	   HC specific operation (WHCI or HWA) for removing an
  *                 MMCIE.
  *
- * @enc_types	   Array which describes the encryptions methods
- *                 supported by the host as described in WUSB1.0 --
- *                 one entry per supported method. As of WUSB1.0 there
- *                 is only four methods, we make space for eight just in
- *                 case they decide to add some more (and pray they do
- *                 it in sequential order). if 'enc_types[enc_method]
- *                 != 0', then it is supported by the host. enc_method
- *                 is USB_ENC_TYPE*.
- *
  * @set_ptk:       Set the PTK and enable encryption for a device. Or, if
  *                 the supplied key is NULL, disable encryption for that
  *                 device.
@@ -269,7 +273,7 @@ struct wusbhc {
 	u8 mmcies_max;
 	/* FIXME: make wusbhc_ops? */
 	int (*start)(struct wusbhc *wusbhc);
-	void (*stop)(struct wusbhc *wusbhc);
+	void (*stop)(struct wusbhc *wusbhc, int delay);
 	int (*mmcie_add)(struct wusbhc *wusbhc, u8 interval, u8 repeat_cnt,
 			 u8 handle, struct wuie_hdr *wuie);
 	int (*mmcie_rm)(struct wusbhc *wusbhc, u8 handle);
diff --git a/include/linux/usb/wusb-wa.h b/include/linux/usb/wusb-wa.h
index a102561e702..fb7c359bdfb 100644
--- a/include/linux/usb/wusb-wa.h
+++ b/include/linux/usb/wusb-wa.h
@@ -51,6 +51,7 @@ enum {
 	WUSB_REQ_GET_TIME       = 25,
 	WUSB_REQ_SET_STREAM_IDX = 26,
 	WUSB_REQ_SET_WUSB_MAS   = 27,
+	WUSB_REQ_CHAN_STOP      = 28,
 };
 
 
-- 
cgit v1.2.3-70-g09d2


From 1cde7f68ced8d10a20dd2370e9d1d22ab3c1ea5c Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 27 Oct 2008 16:48:09 +0000
Subject: uwb: order IEs by element ID

ECMA-368 requires that IEs in a beacon must be sorted by element ID.  Most
hardware uses the ordering in the Set IE URC command so get the ordering
right on the host.

Also refactor the IE management code:
  - use uwb_ie_next() instead of uwb_ie_for_each().
  - remove unnecessary functions.
  - API is now only uwb_rc_ie_add() and uwb_rc_ie_rm().

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/beacon.c           |  24 +--
 drivers/uwb/ie.c               | 463 +++++++++++++----------------------------
 drivers/uwb/lc-rc.c            |  25 ---
 drivers/uwb/uwb-internal.h     |  16 +-
 drivers/uwb/wlp/wlp-internal.h |   4 -
 include/linux/uwb.h            |  18 +-
 6 files changed, 172 insertions(+), 378 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/beacon.c b/drivers/uwb/beacon.c
index 46b18eec502..ad823987ced 100644
--- a/drivers/uwb/beacon.c
+++ b/drivers/uwb/beacon.c
@@ -349,22 +349,22 @@ ssize_t uwb_bce_print_IEs(struct uwb_dev *uwb_dev, struct uwb_beca_e *bce,
 	ssize_t result = 0;
 	struct uwb_rc_evt_beacon *be;
 	struct uwb_beacon_frame *bf;
-	struct uwb_buf_ctx ctx = {
-		.buf = buf,
-		.bytes = 0,
-		.size = size
-	};
+	int ies_len;
+	struct uwb_ie_hdr *ies;
 
 	mutex_lock(&bce->mutex);
+
 	be = bce->be;
-	if (be == NULL)
-		goto out;
-	bf = (void *) be->BeaconInfo;
-	uwb_ie_for_each(uwb_dev, uwb_ie_dump_hex, &ctx,
-			bf->IEData, be->wBeaconInfoLength - sizeof(*bf));
-	result = ctx.bytes;
-out:
+	if (be) {
+		bf = (struct uwb_beacon_frame *)bce->be->BeaconInfo;
+		ies_len = be->wBeaconInfoLength - sizeof(struct uwb_beacon_frame);
+		ies = (struct uwb_ie_hdr *)bf->IEData;
+
+		result = uwb_ie_dump_hex(ies, ies_len, buf, size);
+	}
+
 	mutex_unlock(&bce->mutex);
+
 	return result;
 }
 
diff --git a/drivers/uwb/ie.c b/drivers/uwb/ie.c
index cf6f3d152b9..ab976686175 100644
--- a/drivers/uwb/ie.c
+++ b/drivers/uwb/ie.c
@@ -25,8 +25,6 @@
  */
 
 #include "uwb-internal.h"
-#define D_LOCAL 0
-#include <linux/uwb/debug.h>
 
 /**
  * uwb_ie_next - get the next IE in a buffer
@@ -60,6 +58,42 @@ struct uwb_ie_hdr *uwb_ie_next(void **ptr, size_t *len)
 }
 EXPORT_SYMBOL_GPL(uwb_ie_next);
 
+/**
+ * uwb_ie_dump_hex - print IEs to a character buffer
+ * @ies: the IEs to print.
+ * @len: length of all the IEs.
+ * @buf: the destination buffer.
+ * @size: size of @buf.
+ *
+ * Returns the number of characters written.
+ */
+int uwb_ie_dump_hex(const struct uwb_ie_hdr *ies, size_t len,
+		    char *buf, size_t size)
+{
+	void *ptr;
+	const struct uwb_ie_hdr *ie;
+	int r = 0;
+	u8 *d;
+
+	ptr = (void *)ies;
+	for (;;) {
+		ie = uwb_ie_next(&ptr, &len);
+		if (!ie)
+			break;
+
+		r += scnprintf(buf + r, size - r, "%02x %02x",
+			       (unsigned)ie->element_id,
+			       (unsigned)ie->length);
+		d = (uint8_t *)ie + sizeof(struct uwb_ie_hdr);
+		while (d != ptr && r < size)
+			r += scnprintf(buf + r, size - r, " %02x", (unsigned)*d++);
+		if (r < size)
+			buf[r++] = '\n';
+	};
+
+	return r;
+}
+
 /**
  * Get the IEs that a radio controller is sending in its beacon
  *
@@ -70,6 +104,7 @@ EXPORT_SYMBOL_GPL(uwb_ie_next);
  * anything. Once done with the iedata buffer, call
  * uwb_rc_ie_release(iedata). Don't call kfree on it.
  */
+static
 ssize_t uwb_rc_get_ie(struct uwb_rc *uwb_rc, struct uwb_rc_evt_get_ie **pget_ie)
 {
 	ssize_t result;
@@ -78,148 +113,35 @@ ssize_t uwb_rc_get_ie(struct uwb_rc *uwb_rc, struct uwb_rc_evt_get_ie **pget_ie)
 	struct uwb_rceb *reply = NULL;
 	struct uwb_rc_evt_get_ie *get_ie;
 
-	d_fnstart(3, dev, "(%p, %p)\n", uwb_rc, pget_ie);
-	result = -ENOMEM;
 	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
 	if (cmd == NULL)
-		goto error_kzalloc;
+		return -ENOMEM;
+
 	cmd->bCommandType = UWB_RC_CET_GENERAL;
 	cmd->wCommand = cpu_to_le16(UWB_RC_CMD_GET_IE);
 	result = uwb_rc_vcmd(uwb_rc, "GET_IE", cmd, sizeof(*cmd),
 			     UWB_RC_CET_GENERAL, UWB_RC_CMD_GET_IE,
 			     &reply);
+	kfree(cmd);
 	if (result < 0)
-		goto error_cmd;
+		return result;
+
 	get_ie = container_of(reply, struct uwb_rc_evt_get_ie, rceb);
 	if (result < sizeof(*get_ie)) {
 		dev_err(dev, "not enough data returned for decoding GET IE "
 			"(%zu bytes received vs %zu needed)\n",
 			result, sizeof(*get_ie));
-		result = -EINVAL;
+		return -EINVAL;
 	} else if (result < sizeof(*get_ie) + le16_to_cpu(get_ie->wIELength)) {
 		dev_err(dev, "not enough data returned for decoding GET IE "
 			"payload (%zu bytes received vs %zu needed)\n", result,
 			sizeof(*get_ie) + le16_to_cpu(get_ie->wIELength));
-		result = -EINVAL;
-	} else
-		*pget_ie = get_ie;
-error_cmd:
-	kfree(cmd);
-error_kzalloc:
-	d_fnend(3, dev, "(%p, %p) = %d\n", uwb_rc, pget_ie, (int)result);
-	return result;
-}
-EXPORT_SYMBOL_GPL(uwb_rc_get_ie);
-
-
-/*
- * Given a pointer to an IE, print it in ASCII/hex followed by a new line
- *
- * @ie_hdr: pointer to the IE header. Length is in there, and it is
- *          guaranteed that the ie_hdr->length bytes following it are
- *          safely accesible.
- *
- * @_data: context data passed from uwb_ie_for_each(), an struct output_ctx
- */
-int uwb_ie_dump_hex(struct uwb_dev *uwb_dev, const struct uwb_ie_hdr *ie_hdr,
-		    size_t offset, void *_ctx)
-{
-	struct uwb_buf_ctx *ctx = _ctx;
-	const u8 *pl = (void *)(ie_hdr + 1);
-	u8 pl_itr;
-
-	ctx->bytes += scnprintf(ctx->buf + ctx->bytes, ctx->size - ctx->bytes,
-				"%02x %02x ", (unsigned) ie_hdr->element_id,
-				(unsigned) ie_hdr->length);
-	pl_itr = 0;
-	while (pl_itr < ie_hdr->length && ctx->bytes < ctx->size)
-		ctx->bytes += scnprintf(ctx->buf + ctx->bytes,
-					ctx->size - ctx->bytes,
-					"%02x ", (unsigned) pl[pl_itr++]);
-	if (ctx->bytes < ctx->size)
-		ctx->buf[ctx->bytes++] = '\n';
-	return 0;
-}
-EXPORT_SYMBOL_GPL(uwb_ie_dump_hex);
-
-
-/**
- * Verify that a pointer in a buffer points to valid IE
- *
- * @start: pointer to start of buffer in which IE appears
- * @itr:   pointer to IE inside buffer that will be verified
- * @top:   pointer to end of buffer
- *
- * @returns: 0 if IE is valid, <0 otherwise
- *
- * Verification involves checking that the buffer can contain a
- * header and the amount of data reported in the IE header can be found in
- * the buffer.
- */
-static
-int uwb_rc_ie_verify(struct uwb_dev *uwb_dev, const void *start,
-		     const void *itr, const void *top)
-{
-	struct device *dev = &uwb_dev->dev;
-	const struct uwb_ie_hdr *ie_hdr;
-
-	if (top - itr < sizeof(*ie_hdr)) {
-		dev_err(dev, "Bad IE: no data to decode header "
-			"(%zu bytes left vs %zu needed) at offset %zu\n",
-			top - itr, sizeof(*ie_hdr), itr - start);
-		return -EINVAL;
-	}
-	ie_hdr = itr;
-	itr += sizeof(*ie_hdr);
-	if (top - itr < ie_hdr->length) {
-		dev_err(dev, "Bad IE: not enough data for payload "
-			"(%zu bytes left vs %zu needed) at offset %zu\n",
-			top - itr, (size_t)ie_hdr->length,
-			(void *)ie_hdr - start);
 		return -EINVAL;
 	}
-	return 0;
-}
 
-
-/**
- * Walk a buffer filled with consecutive IE's a buffer
- *
- * @uwb_dev: UWB device this IEs belong to (for err messages mainly)
- *
- * @fn: function to call with each IE; if it returns 0, we keep
- *      traversing the buffer. If it returns !0, we'll stop and return
- *      that value.
- *
- * @data: pointer passed to @fn
- *
- * @buf: buffer where the consecutive IEs are located
- *
- * @size: size of @buf
- *
- * Each IE is checked for basic correctness (there is space left for
- * the header and the payload). If that test is failed, we stop
- * processing. For every good IE, @fn is called.
- */
-ssize_t uwb_ie_for_each(struct uwb_dev *uwb_dev, uwb_ie_f fn, void *data,
-			const void *buf, size_t size)
-{
-	ssize_t result = 0;
-	const struct uwb_ie_hdr *ie_hdr;
-	const void *itr = buf, *top = itr + size;
-
-	while (itr < top) {
-		if (uwb_rc_ie_verify(uwb_dev, buf, itr, top) != 0)
-			break;
-		ie_hdr = itr;
-		itr += sizeof(*ie_hdr) + ie_hdr->length;
-		result = fn(uwb_dev, ie_hdr, itr - buf, data);
-		if (result != 0)
-			break;
-	}
+	*pget_ie = get_ie;
 	return result;
 }
-EXPORT_SYMBOL_GPL(uwb_ie_for_each);
 
 
 /**
@@ -256,70 +178,6 @@ error_cmd:
 	return result;
 }
 
-/**
- * Determine by IE id if IE is host settable
- * WUSB 1.0 [8.6.2.8 Table 8.85]
- *
- * EXCEPTION:
- * All but UWB_IE_WLP appears in Table 8.85 from WUSB 1.0. Setting this IE
- * is required for the WLP substack to perform association with its WSS so
- * we hope that the WUSB spec will be changed to reflect this.
- */
-static
-int uwb_rc_ie_is_host_settable(enum uwb_ie element_id)
-{
-	if (element_id == UWB_PCA_AVAILABILITY ||
-	    element_id == UWB_BP_SWITCH_IE ||
-	    element_id == UWB_MAC_CAPABILITIES_IE ||
-	    element_id == UWB_PHY_CAPABILITIES_IE ||
-	    element_id == UWB_APP_SPEC_PROBE_IE ||
-	    element_id == UWB_IDENTIFICATION_IE ||
-	    element_id == UWB_MASTER_KEY_ID_IE ||
-	    element_id == UWB_IE_WLP ||
-	    element_id == UWB_APP_SPEC_IE)
-		return 1;
-	return 0;
-}
-
-
-/**
- * Extract Host Settable IEs from IE
- *
- * @ie_data: pointer to buffer containing all IEs
- * @size:    size of buffer
- *
- * @returns: length of buffer that only includes host settable IEs
- *
- * Given a buffer of IEs we move all Host Settable IEs to front of buffer
- * by overwriting the IEs that are not Host Settable.
- * Buffer length is adjusted accordingly.
- */
-static
-ssize_t uwb_rc_parse_host_settable_ie(struct uwb_dev *uwb_dev,
-				      void *ie_data, size_t size)
-{
-	size_t new_len = size;
-	struct uwb_ie_hdr *ie_hdr;
-	size_t ie_length;
-	void *itr = ie_data, *top = itr + size;
-
-	while (itr < top) {
-		if (uwb_rc_ie_verify(uwb_dev, ie_data, itr, top) != 0)
-			break;
-		ie_hdr = itr;
-		ie_length = sizeof(*ie_hdr) + ie_hdr->length;
-		if (uwb_rc_ie_is_host_settable(ie_hdr->element_id)) {
-			itr += ie_length;
-		} else {
-			memmove(itr, itr + ie_length, top - (itr + ie_length));
-			new_len -= ie_length;
-			top -= ie_length;
-		}
-	}
-	return new_len;
-}
-
-
 /* Cleanup the whole IE management subsystem */
 void uwb_rc_ie_init(struct uwb_rc *uwb_rc)
 {
@@ -328,49 +186,34 @@ void uwb_rc_ie_init(struct uwb_rc *uwb_rc)
 
 
 /**
- * Set up cache for host settable IEs currently being transmitted
+ * uwb_rc_ie_setup - setup a radio controller's IE manager
+ * @uwb_rc: the radio controller.
  *
- * First we just call GET-IE to get the current IEs being transmitted
- * (or we workaround and pretend we did) and (because the format is
- * the same) reuse that as the IE cache (with the command prefix, as
- * explained in 'struct uwb_rc').
+ * The current set of IEs are obtained from the hardware with a GET-IE
+ * command (since the radio controller is not yet beaconing this will
+ * be just the hardware's MAC and PHY Capability IEs).
  *
- * @returns: size of cache created
+ * Returns 0 on success; -ve on an error.
  */
-ssize_t uwb_rc_ie_setup(struct uwb_rc *uwb_rc)
+int uwb_rc_ie_setup(struct uwb_rc *uwb_rc)
 {
-	struct device *dev = &uwb_rc->uwb_dev.dev;
-	ssize_t result;
-	size_t capacity;
-	struct uwb_rc_evt_get_ie *ie_info;
+	struct uwb_rc_evt_get_ie *ie_info = NULL;
+	int capacity;
+
+	capacity = uwb_rc_get_ie(uwb_rc, &ie_info);
+	if (capacity < 0)
+		return capacity;
 
-	d_fnstart(3, dev, "(%p)\n", uwb_rc);
 	mutex_lock(&uwb_rc->ies_mutex);
-	result = uwb_rc_get_ie(uwb_rc, &ie_info);
-	if (result < 0)
-		goto error_get_ie;
-	capacity = result;
-	d_printf(5, dev, "Got IEs %zu bytes (%zu long at %p)\n", result,
-		 (size_t)le16_to_cpu(ie_info->wIELength), ie_info);
-
-	/* Remove IEs that host should not set. */
-	result = uwb_rc_parse_host_settable_ie(&uwb_rc->uwb_dev,
-			ie_info->IEData, le16_to_cpu(ie_info->wIELength));
-	if (result < 0)
-		goto error_parse;
-	d_printf(5, dev, "purged non-settable IEs to %zu bytes\n", result);
-	uwb_rc->ies = (void *) ie_info;
+
+	uwb_rc->ies = (struct uwb_rc_cmd_set_ie *)ie_info;
 	uwb_rc->ies->rccb.bCommandType = UWB_RC_CET_GENERAL;
 	uwb_rc->ies->rccb.wCommand = cpu_to_le16(UWB_RC_CMD_SET_IE);
 	uwb_rc->ies_capacity = capacity;
-	d_printf(5, dev, "IE cache at %p %zu bytes, %zu capacity\n",
-		 ie_info, result, capacity);
-	result = 0;
-error_parse:
-error_get_ie:
+
 	mutex_unlock(&uwb_rc->ies_mutex);
-	d_fnend(3, dev, "(%p) = %zu\n", uwb_rc, result);
-	return result;
+
+	return 0;
 }
 
 
@@ -383,26 +226,47 @@ void uwb_rc_ie_release(struct uwb_rc *uwb_rc)
 }
 
 
-static
-int __acc_size(struct uwb_dev *uwb_dev, const struct uwb_ie_hdr *ie_hdr,
-	       size_t offset, void *_ctx)
+static int uwb_rc_ie_add_one(struct uwb_rc *rc, const struct uwb_ie_hdr *new_ie)
 {
-	size_t *acc_size = _ctx;
-	*acc_size += sizeof(*ie_hdr) + ie_hdr->length;
-	d_printf(6, &uwb_dev->dev, "new acc size %zu\n", *acc_size);
+	struct uwb_rc_cmd_set_ie *new_ies;
+	void *ptr, *prev_ie;
+	struct uwb_ie_hdr *ie;
+	size_t length, new_ie_len, new_capacity, size, prev_size;
+
+	length = le16_to_cpu(rc->ies->wIELength);
+	new_ie_len = sizeof(struct uwb_ie_hdr) + new_ie->length;
+	new_capacity = sizeof(struct uwb_rc_cmd_set_ie) + length + new_ie_len;
+
+	if (new_capacity > rc->ies_capacity) {
+		new_ies = krealloc(rc->ies, new_capacity, GFP_KERNEL);
+		if (!new_ies)
+			return -ENOMEM;
+		rc->ies = new_ies;
+	}
+
+	ptr = rc->ies->IEData;
+	size = length;
+	for (;;) {
+		prev_ie = ptr;
+		prev_size = size;
+		ie = uwb_ie_next(&ptr, &size);
+		if (!ie || ie->element_id > new_ie->element_id)
+			break;
+	}
+
+	memmove(prev_ie + new_ie_len, prev_ie, prev_size);
+	memcpy(prev_ie, new_ie, new_ie_len);
+	rc->ies->wIELength = cpu_to_le16(length + new_ie_len);
+
 	return 0;
 }
 
-
 /**
- * Add a new IE to IEs currently being transmitted by device
- *
+ * uwb_rc_ie_add - add new IEs to the radio controller's beacon
+ * @uwb_rc: the radio controller.
  * @ies: the buffer containing the new IE or IEs to be added to
- *       the device's beacon. The buffer will be verified for
- *       consistence (meaning the headers should be right) and
- *       consistent with the buffer size.
- * @size: size of @ies (in bytes, total buffer size)
- * @returns: 0 if ok, <0 errno code on error
+ *       the device's beacon.
+ * @size: length of all the IEs.
  *
  * According to WHCI 0.95 [4.13.6] the driver will only receive the RCEB
  * after the device sent the first beacon that includes the IEs specified
@@ -411,66 +275,40 @@ int __acc_size(struct uwb_dev *uwb_dev, const struct uwb_ie_hdr *ie_hdr,
  * we start beaconing.
  *
  * Setting an IE on the device will overwrite all current IEs in device. So
- * we take the current IEs being transmitted by the device, append the
+ * we take the current IEs being transmitted by the device, insert the
  * new one, and call SET IE with all the IEs needed.
  *
- * The local IE cache will only be updated with the new IE if SET IE
- * completed successfully.
+ * Returns 0 on success; or -ENOMEM.
  */
 int uwb_rc_ie_add(struct uwb_rc *uwb_rc,
 		  const struct uwb_ie_hdr *ies, size_t size)
 {
 	int result = 0;
-	struct device *dev = &uwb_rc->uwb_dev.dev;
-	struct uwb_rc_cmd_set_ie *new_ies;
-	size_t ies_size, total_size, acc_size = 0;
-
-	if (uwb_rc->ies == NULL)
-		return -ESHUTDOWN;
-	uwb_ie_for_each(&uwb_rc->uwb_dev, __acc_size, &acc_size, ies, size);
-	if (acc_size != size) {
-		dev_err(dev, "BUG: bad IEs, misconstructed headers "
-			"[%zu bytes reported vs %zu calculated]\n",
-			size, acc_size);
-		WARN_ON(1);
-		return -EINVAL;
-	}
+	void *ptr;
+	const struct uwb_ie_hdr *ie;
+
 	mutex_lock(&uwb_rc->ies_mutex);
-	ies_size = le16_to_cpu(uwb_rc->ies->wIELength);
-	total_size = sizeof(*uwb_rc->ies) + ies_size;
-	if (total_size + size > uwb_rc->ies_capacity) {
-		d_printf(4, dev, "Reallocating IE cache from %p capacity %zu "
-			 "to capacity %zu\n", uwb_rc->ies, uwb_rc->ies_capacity,
-			 total_size + size);
-		new_ies = kzalloc(total_size + size, GFP_KERNEL);
-		if (new_ies == NULL) {
-			dev_err(dev, "No memory for adding new IE\n");
-			result = -ENOMEM;
-			goto error_alloc;
-		}
-		memcpy(new_ies, uwb_rc->ies, total_size);
-		uwb_rc->ies_capacity = total_size + size;
-		kfree(uwb_rc->ies);
-		uwb_rc->ies = new_ies;
-		d_printf(4, dev, "New IE cache at %p capacity %zu\n",
-			 uwb_rc->ies, uwb_rc->ies_capacity);
+
+	ptr = (void *)ies;
+	for (;;) {
+		ie = uwb_ie_next(&ptr, &size);
+		if (!ie)
+			break;
+
+		result = uwb_rc_ie_add_one(uwb_rc, ie);
+		if (result < 0)
+			break;
 	}
-	memcpy((void *)uwb_rc->ies + total_size, ies, size);
-	uwb_rc->ies->wIELength = cpu_to_le16(ies_size + size);
-	if (uwb_rc->beaconing != -1) {
-		result = uwb_rc_set_ie(uwb_rc, uwb_rc->ies);
-		if (result < 0) {
-			dev_err(dev, "Cannot set new IE on device: %d\n",
-				result);
-			uwb_rc->ies->wIELength = cpu_to_le16(ies_size);
+	if (result >= 0) {
+		if (size == 0) {
+			if (uwb_rc->beaconing != -1)
+				result = uwb_rc_set_ie(uwb_rc, uwb_rc->ies);
 		} else
-			result = 0;
+			result = -EINVAL;
 	}
-	d_printf(4, dev, "IEs now occupy %hu bytes of %zu capacity at %p\n",
-		 le16_to_cpu(uwb_rc->ies->wIELength), uwb_rc->ies_capacity,
-		 uwb_rc->ies);
-error_alloc:
+
 	mutex_unlock(&uwb_rc->ies_mutex);
+
 	return result;
 }
 EXPORT_SYMBOL_GPL(uwb_rc_ie_add);
@@ -489,53 +327,52 @@ EXPORT_SYMBOL_GPL(uwb_rc_ie_add);
  * beacon. We don't reallocate, we just mark the size smaller.
  */
 static
-int uwb_rc_ie_cache_rm(struct uwb_rc *uwb_rc, enum uwb_ie to_remove)
+void uwb_rc_ie_cache_rm(struct uwb_rc *uwb_rc, enum uwb_ie to_remove)
 {
-	struct uwb_ie_hdr *ie_hdr;
-	size_t new_len = le16_to_cpu(uwb_rc->ies->wIELength);
-	void *itr = uwb_rc->ies->IEData;
-	void *top = itr + new_len;
-
-	while (itr < top) {
-		ie_hdr = itr;
-		if (ie_hdr->element_id != to_remove) {
-			itr += sizeof(*ie_hdr) + ie_hdr->length;
-		} else {
-			int ie_length;
-			ie_length = sizeof(*ie_hdr) + ie_hdr->length;
-			if (top - itr != ie_length)
-				memmove(itr, itr + ie_length, top - itr + ie_length);
-			top -= ie_length;
-			new_len -= ie_length;
+	struct uwb_ie_hdr *ie;
+	size_t len = le16_to_cpu(uwb_rc->ies->wIELength);
+	void *ptr;
+	size_t size;
+
+	ptr = uwb_rc->ies->IEData;
+	size = len;
+	for (;;) {
+		ie = uwb_ie_next(&ptr, &size);
+		if (!ie)
+			break;
+		if (ie->element_id == to_remove) {
+			len -= sizeof(struct uwb_ie_hdr) + ie->length;
+			memmove(ie, ptr, size);
+			ptr = ie;
 		}
 	}
-	uwb_rc->ies->wIELength = cpu_to_le16(new_len);
-	return 0;
+	uwb_rc->ies->wIELength = cpu_to_le16(len);
 }
 
 
 /**
- * Remove an IE currently being transmitted by device
+ * uwb_rc_ie_rm - remove an IE from the radio controller's beacon
+ * @uwb_rc: the radio controller.
+ * @element_id: the element ID of the IE to remove.
  *
- * @element_id: id of IE to be removed from device's beacon
+ * Only IEs previously added with uwb_rc_ie_add() may be removed.
+ *
+ * Returns 0 on success; or -ve the SET-IE command to the radio
+ * controller failed.
  */
 int uwb_rc_ie_rm(struct uwb_rc *uwb_rc, enum uwb_ie element_id)
 {
-	struct device *dev = &uwb_rc->uwb_dev.dev;
-	int result;
+	int result = 0;
 
-	if (uwb_rc->ies == NULL)
-		return -ESHUTDOWN;
 	mutex_lock(&uwb_rc->ies_mutex);
-	result = uwb_rc_ie_cache_rm(uwb_rc, element_id);
-	if (result < 0)
-		dev_err(dev, "Cannot remove IE from cache.\n");
-	if (uwb_rc->beaconing != -1) {
+
+	uwb_rc_ie_cache_rm(uwb_rc, element_id);
+
+	if (uwb_rc->beaconing != -1)
 		result = uwb_rc_set_ie(uwb_rc, uwb_rc->ies);
-		if (result < 0)
-			dev_err(dev, "Cannot set new IE on device.\n");
-	}
+
 	mutex_unlock(&uwb_rc->ies_mutex);
+
 	return result;
 }
 EXPORT_SYMBOL_GPL(uwb_rc_ie_rm);
diff --git a/drivers/uwb/lc-rc.c b/drivers/uwb/lc-rc.c
index ee5772f00d4..1129e8767b5 100644
--- a/drivers/uwb/lc-rc.c
+++ b/drivers/uwb/lc-rc.c
@@ -468,28 +468,3 @@ void uwb_rc_put(struct uwb_rc *rc)
 	__uwb_rc_put(rc);
 }
 EXPORT_SYMBOL_GPL(uwb_rc_put);
-
-/*
- *
- *
- */
-ssize_t uwb_rc_print_IEs(struct uwb_rc *uwb_rc, char *buf, size_t size)
-{
-	ssize_t result;
-	struct uwb_rc_evt_get_ie *ie_info;
-	struct uwb_buf_ctx ctx;
-
-	result = uwb_rc_get_ie(uwb_rc, &ie_info);
-	if (result < 0)
-		goto error_get_ie;
-	ctx.buf = buf;
-	ctx.size = size;
-	ctx.bytes = 0;
-	uwb_ie_for_each(&uwb_rc->uwb_dev, uwb_ie_dump_hex, &ctx,
-			ie_info->IEData, result - sizeof(*ie_info));
-	result = ctx.bytes;
-	kfree(ie_info);
-error_get_ie:
-	return result;
-}
-
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 2ad307d1296..983ebc4dd8d 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -66,14 +66,14 @@ extern int uwb_rc_scan(struct uwb_rc *rc,
 		       unsigned channel, enum uwb_scan_type type,
 		       unsigned bpst_offset);
 extern int uwb_rc_send_all_drp_ie(struct uwb_rc *rc);
-extern ssize_t uwb_rc_print_IEs(struct uwb_rc *rc, char *, size_t);
-extern void uwb_rc_ie_init(struct uwb_rc *);
-extern void uwb_rc_ie_init(struct uwb_rc *);
-extern ssize_t uwb_rc_ie_setup(struct uwb_rc *);
-extern void uwb_rc_ie_release(struct uwb_rc *);
-extern int uwb_rc_ie_add(struct uwb_rc *,
-			 const struct uwb_ie_hdr *, size_t);
-extern int uwb_rc_ie_rm(struct uwb_rc *, enum uwb_ie);
+
+void uwb_rc_ie_init(struct uwb_rc *);
+int uwb_rc_ie_setup(struct uwb_rc *);
+void uwb_rc_ie_release(struct uwb_rc *);
+int uwb_ie_dump_hex(const struct uwb_ie_hdr *ies, size_t len,
+		    char *buf, size_t size);
+int uwb_rc_set_ie(struct uwb_rc *, struct uwb_rc_cmd_set_ie *);
+
 
 extern const char *uwb_rc_strerror(unsigned code);
 
diff --git a/drivers/uwb/wlp/wlp-internal.h b/drivers/uwb/wlp/wlp-internal.h
index 1c94fabfb1a..3e8d5de7c5b 100644
--- a/drivers/uwb/wlp/wlp-internal.h
+++ b/drivers/uwb/wlp/wlp-internal.h
@@ -42,10 +42,6 @@ enum wlp_wss_connect {
 extern struct kobj_type wss_ktype;
 extern struct attribute_group wss_attr_group;
 
-extern int uwb_rc_ie_add(struct uwb_rc *, const struct uwb_ie_hdr *, size_t);
-extern int uwb_rc_ie_rm(struct uwb_rc *, enum uwb_ie);
-
-
 /* This should be changed to a dynamic array where entries are sorted
  * by eth_addr and search is done in a binary form
  *
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index 010ee708304..6d93f54b887 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -444,7 +444,6 @@ ssize_t uwb_rc_vcmd(struct uwb_rc *rc, const char *cmd_name,
 		    struct uwb_rccb *cmd, size_t cmd_size,
 		    u8 expected_type, u16 expected_event,
 		    struct uwb_rceb **preply);
-ssize_t uwb_rc_get_ie(struct uwb_rc *, struct uwb_rc_evt_get_ie **);
 int uwb_bg_joined(struct uwb_rc *rc);
 
 size_t __uwb_addr_print(char *, size_t, const unsigned char *, int);
@@ -653,22 +652,9 @@ static inline int edc_inc(struct edc *err_hist, u16 max_err, u16 timeframe)
 
 /* Information Element handling */
 
-/* For representing the state of writing to a buffer when iterating */
-struct uwb_buf_ctx {
-	char *buf;
-	size_t bytes, size;
-};
-
-typedef int (*uwb_ie_f)(struct uwb_dev *, const struct uwb_ie_hdr *,
-			size_t, void *);
 struct uwb_ie_hdr *uwb_ie_next(void **ptr, size_t *len);
-ssize_t uwb_ie_for_each(struct uwb_dev *uwb_dev, uwb_ie_f fn, void *data,
-			const void *buf, size_t size);
-int uwb_ie_dump_hex(struct uwb_dev *, const struct uwb_ie_hdr *,
-		    size_t, void *);
-int uwb_rc_set_ie(struct uwb_rc *, struct uwb_rc_cmd_set_ie *);
-struct uwb_ie_hdr *uwb_ie_next(void **ptr, size_t *len);
-
+int uwb_rc_ie_add(struct uwb_rc *uwb_rc, const struct uwb_ie_hdr *ies, size_t size);
+int uwb_rc_ie_rm(struct uwb_rc *uwb_rc, enum uwb_ie element_id);
 
 /*
  * Transmission statistics
-- 
cgit v1.2.3-70-g09d2


From def8b4faff5ca349beafbbfeb2c51f3602a6ef3a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 28 Oct 2008 13:24:06 -0700
Subject: net: reduce structures when XFRM=n

ifdef out
* struct sk_buff::sp		(pointer)
* struct dst_entry::xfrm	(pointer)
* struct sock::sk_policy	(2 pointers)

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h   | 15 ++++++++++++++-
 include/net/dst.h        |  3 ++-
 include/net/sock.h       |  2 ++
 include/net/xfrm.h       |  4 ++++
 net/core/skbuff.c        |  2 +-
 net/ipv4/icmp.c          |  3 ++-
 net/ipv4/ip_forward.c    |  2 +-
 net/ipv4/route.c         |  2 ++
 net/ipv6/icmp.c          |  3 ++-
 net/ipv6/ip6_output.c    |  2 +-
 security/selinux/hooks.c |  4 ++--
 11 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2725f4e5a9b..487e34507b4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -269,8 +269,9 @@ struct sk_buff {
 		struct  dst_entry	*dst;
 		struct  rtable		*rtable;
 	};
+#ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
-
+#endif
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
@@ -1864,6 +1865,18 @@ static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_bu
 	to->queue_mapping = from->queue_mapping;
 }
 
+#ifdef CONFIG_XFRM
+static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
+{
+	return skb->sp;
+}
+#else
+static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
+{
+	return NULL;
+}
+#endif
+
 static inline int skb_is_gso(const struct sk_buff *skb)
 {
 	return skb_shinfo(skb)->gso_size;
diff --git a/include/net/dst.h b/include/net/dst.h
index 8a8b71e5f3f..f96c4ba4dd3 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -59,8 +59,9 @@ struct dst_entry
 
 	struct neighbour	*neighbour;
 	struct hh_cache		*hh;
+#ifdef CONFIG_XFRM
 	struct xfrm_state	*xfrm;
-
+#endif
 	int			(*input)(struct sk_buff*);
 	int			(*output)(struct sk_buff*);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index ada50c04d09..d6b750a2507 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -229,7 +229,9 @@ struct sock {
 	} sk_backlog;
 	wait_queue_head_t	*sk_sleep;
 	struct dst_entry	*sk_dst_cache;
+#ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
+#endif
 	rwlock_t		sk_dst_lock;
 	atomic_t		sk_rmem_alloc;
 	atomic_t		sk_wmem_alloc;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 11c890ad8eb..f2c5ba28a42 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -882,6 +882,7 @@ struct xfrm_dst
 	u32 path_cookie;
 };
 
+#ifdef CONFIG_XFRM
 static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 {
 	dst_release(xdst->route);
@@ -894,6 +895,7 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 	xdst->partner = NULL;
 #endif
 }
+#endif
 
 extern void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
 
@@ -1536,9 +1538,11 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n)
 }
 #endif
 
+#ifdef CONFIG_XFRM
 static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
 {
 	return skb->sp->xvec[skb->sp->len - 1];
 }
+#endif
 
 #endif	/* _NET_XFRM_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4e22e3a3535..cdfe473181a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -489,7 +489,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
 	new->dst		= dst_clone(old->dst);
-#ifdef CONFIG_INET
+#ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
 	memcpy(new->cb, old->cb, sizeof(old->cb));
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 72b2de76f1c..e9d6ea0b49c 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -976,9 +976,10 @@ int icmp_rcv(struct sk_buff *skb)
 	struct net *net = dev_net(rt->u.dst.dev);
 
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		struct sec_path *sp = skb_sec_path(skb);
 		int nh;
 
-		if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags &
+		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
 				 XFRM_STATE_ICMP))
 			goto drop;
 
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 450016b89a1..df3fe50bbf0 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -106,7 +106,7 @@ int ip_forward(struct sk_buff *skb)
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
 	 */
-	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp)
+	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
 		ip_rt_send_redirect(skb);
 
 	skb->priority = rt_tos2priority(iph->tos);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 21ce7e1b228..ffb2c570543 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1399,7 +1399,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				rt->u.dst.path		= &rt->u.dst;
 				rt->u.dst.neighbour	= NULL;
 				rt->u.dst.hh		= NULL;
+#ifdef CONFIG_XFRM
 				rt->u.dst.xfrm		= NULL;
+#endif
 				rt->rt_genid		= rt_genid(net);
 				rt->rt_flags		|= RTCF_REDIRECTED;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 9b7d19ae5ce..508a713ac04 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -646,9 +646,10 @@ static int icmpv6_rcv(struct sk_buff *skb)
 	int type;
 
 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		struct sec_path *sp = skb_sec_path(skb);
 		int nh;
 
-		if (!(skb->sp && skb->sp->xvec[skb->sp->len - 1]->props.flags &
+		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
 				 XFRM_STATE_ICMP))
 			goto drop_no_count;
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c77db0b95e2..7d92fd97cfb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -490,7 +490,7 @@ int ip6_forward(struct sk_buff *skb)
 	   We don't send redirects to frames decapsulated from IPsec.
 	 */
 	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
-	    !skb->sp) {
+	    !skb_sec_path(skb)) {
 		struct in6_addr *target = NULL;
 		struct rt6_info *rt;
 		struct neighbour *n = dst->neighbour;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3e3fde7c1d2..aedf02b1345 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4626,7 +4626,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
 	 * as fast and as clean as possible. */
 	if (selinux_compat_net || !selinux_policycap_netpeer)
 		return selinux_ip_postroute_compat(skb, ifindex, family);
-
+#ifdef CONFIG_XFRM
 	/* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
 	 * packet transformation so allow the packet to pass without any checks
 	 * since we'll have another chance to perform access control checks
@@ -4635,7 +4635,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
 	 *       is NULL, in this case go ahead and apply access control. */
 	if (skb->dst != NULL && skb->dst->xfrm != NULL)
 		return NF_ACCEPT;
-
+#endif
 	secmark_active = selinux_secmark_enabled();
 	peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
 	if (!secmark_active && !peerlbl_active)
-- 
cgit v1.2.3-70-g09d2


From 3a2dfbe8acb154905fdc2fd03ec56df42e6c4cc4 Mon Sep 17 00:00:00 2001
From: Martin Willi <martin@strongswan.org>
Date: Tue, 28 Oct 2008 16:01:07 -0700
Subject: xfrm: Notify changes in UDP encapsulation via netlink

Add new_mapping() implementation to the netlink xfrm_mgr to notify
address/port changes detected in UDP encapsulated ESP packets.

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h | 14 ++++++++++++++
 net/xfrm/xfrm_user.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 4bc1e6b86cb..52f3abd453a 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -199,6 +199,9 @@ enum {
 #define XFRM_MSG_NEWSPDINFO XFRM_MSG_NEWSPDINFO
 	XFRM_MSG_GETSPDINFO,
 #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO
+
+	XFRM_MSG_MAPPING,
+#define XFRM_MSG_MAPPING XFRM_MSG_MAPPING
 	__XFRM_MSG_MAX
 };
 #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -438,6 +441,15 @@ struct xfrm_user_migrate {
 	__u16				new_family;
 };
 
+struct xfrm_user_mapping {
+	struct xfrm_usersa_id		id;
+	__u32				reqid;
+	xfrm_address_t			old_saddr;
+	xfrm_address_t			new_saddr;
+	__be16				old_sport;
+	__be16				new_sport;
+};
+
 #ifndef __KERNEL__
 /* backwards compatibility for userspace */
 #define XFRMGRP_ACQUIRE		1
@@ -464,6 +476,8 @@ enum xfrm_nlgroups {
 #define XFRMNLGRP_REPORT	XFRMNLGRP_REPORT
 	XFRMNLGRP_MIGRATE,
 #define XFRMNLGRP_MIGRATE	XFRMNLGRP_MIGRATE
+	XFRMNLGRP_MAPPING,
+#define XFRMNLGRP_MAPPING	XFRMNLGRP_MAPPING
 	__XFRMNLGRP_MAX
 };
 #define XFRMNLGRP_MAX	(__XFRMNLGRP_MAX - 1)
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 4a8a1abb59e..76cf56d5d83 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2503,6 +2503,57 @@ static int xfrm_send_report(u8 proto, struct xfrm_selector *sel,
 	return nlmsg_multicast(xfrm_nl, skb, 0, XFRMNLGRP_REPORT, GFP_ATOMIC);
 }
 
+static inline size_t xfrm_mapping_msgsize(void)
+{
+	return NLMSG_ALIGN(sizeof(struct xfrm_user_mapping));
+}
+
+static int build_mapping(struct sk_buff *skb, struct xfrm_state *x,
+			 xfrm_address_t *new_saddr, __be16 new_sport)
+{
+	struct xfrm_user_mapping *um;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_MAPPING, sizeof(*um), 0);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	um = nlmsg_data(nlh);
+
+	memcpy(&um->id.daddr, &x->id.daddr, sizeof(um->id.daddr));
+	um->id.spi = x->id.spi;
+	um->id.family = x->props.family;
+	um->id.proto = x->id.proto;
+	memcpy(&um->new_saddr, new_saddr, sizeof(um->new_saddr));
+	memcpy(&um->old_saddr, &x->props.saddr, sizeof(um->old_saddr));
+	um->new_sport = new_sport;
+	um->old_sport = x->encap->encap_sport;
+	um->reqid = x->props.reqid;
+
+	return nlmsg_end(skb, nlh);
+}
+
+static int xfrm_send_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
+			     __be16 sport)
+{
+	struct sk_buff *skb;
+
+	if (x->id.proto != IPPROTO_ESP)
+		return -EINVAL;
+
+	if (!x->encap)
+		return -EINVAL;
+
+	skb = nlmsg_new(xfrm_mapping_msgsize(), GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (build_mapping(skb, x, ipaddr, sport) < 0)
+		BUG();
+
+	return nlmsg_multicast(xfrm_nl, skb, 0, XFRMNLGRP_MAPPING, GFP_ATOMIC);
+}
+
 static struct xfrm_mgr netlink_mgr = {
 	.id		= "netlink",
 	.notify		= xfrm_send_state_notify,
@@ -2511,6 +2562,7 @@ static struct xfrm_mgr netlink_mgr = {
 	.notify_policy	= xfrm_send_policy_notify,
 	.report		= xfrm_send_report,
 	.migrate	= xfrm_send_migrate,
+	.new_mapping	= xfrm_send_mapping,
 };
 
 static int __init xfrm_user_init(void)
-- 
cgit v1.2.3-70-g09d2


From 0c6ce78abf6e228d44c3840edb8a4ae0c1299825 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 28 Oct 2008 16:09:23 -0700
Subject: net: replace uses of NIP6_FMT with %p6

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sunrpc/svc_xprt.h                |  4 +--
 include/net/ip_vs.h                            |  4 +--
 include/net/netfilter/nf_conntrack_tuple.h     |  6 ++---
 include/net/sctp/sctp.h                        |  4 +--
 net/ipv4/tcp_input.c                           |  4 +--
 net/ipv4/tcp_timer.c                           |  4 +--
 net/ipv6/addrlabel.c                           | 34 +++++++++-----------------
 net/ipv6/ah6.c                                 |  4 +--
 net/ipv6/esp6.c                                |  4 +--
 net/ipv6/exthdrs.c                             |  2 +-
 net/ipv6/icmp.c                                |  4 +--
 net/ipv6/ip6mr.c                               |  5 ++--
 net/ipv6/ipcomp6.c                             |  4 +--
 net/ipv6/ndisc.c                               |  7 ++----
 net/ipv6/netfilter/ip6t_LOG.c                  |  2 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  5 ++--
 net/ipv6/tcp_ipv6.c                            |  8 +++---
 17 files changed, 43 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 6fd7b016517..42e01c93c7e 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -145,8 +145,8 @@ static inline char *__svc_print_addr(struct sockaddr *addr,
 		break;
 
 	case AF_INET6:
-		snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
-			NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
+		snprintf(buf, len, "%p6, port=%u",
+			 &((struct sockaddr_in6 *)addr)->sin6_addr,
 			ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
 		break;
 
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index fe9fcf73c85..6a669206709 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -87,8 +87,8 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
 	int len;
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
-		len = snprintf(&buf[*idx], buf_len - *idx, "[" NIP6_FMT "]",
-			       NIP6(addr->in6)) + 1;
+		len = snprintf(&buf[*idx], buf_len - *idx, "[%p6]",
+			       &addr->in6) + 1;
 	else
 #endif
 		len = snprintf(&buf[*idx], buf_len - *idx, NIPQUAD_FMT,
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index a6874ba22d5..303efaf68d0 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -122,10 +122,10 @@ static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t)
 static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t)
 {
 #ifdef DEBUG
-	printk("tuple %p: %u " NIP6_FMT " %hu -> " NIP6_FMT " %hu\n",
+	printk("tuple %p: %u %p6 %hu -> %p6 %hu\n",
 	       t, t->dst.protonum,
-	       NIP6(*(struct in6_addr *)t->src.u3.all), ntohs(t->src.u.all),
-	       NIP6(*(struct in6_addr *)t->dst.u3.all), ntohs(t->dst.u.all));
+	       t->src.u3.all, ntohs(t->src.u.all),
+	       t->dst.u3.all, ntohs(t->dst.u.all));
 #endif
 }
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index ed71b110edf..a84c3976e1b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -285,9 +285,9 @@ extern int sctp_debug_flag;
 	if (sctp_debug_flag) { \
 		if (saddr->sa.sa_family == AF_INET6) { \
 			printk(KERN_DEBUG \
-			       lead NIP6_FMT trail, \
+			       lead "%p6" trail, \
 			       leadparm, \
-			       NIP6(saddr->v6.sin6_addr), \
+			       &saddr->v6.sin6_addr, \
 			       otherparms); \
 		} else { \
 			printk(KERN_DEBUG \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d77c0d29e23..191c06bb0f6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2346,9 +2346,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	else if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-		printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n",
+		printk(KERN_DEBUG "Undo %s %p6/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       NIP6(np->daddr), ntohs(inet->dport),
+		       &np->daddr, ntohs(inet->dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6b6dff1164b..4e6ee520523 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -306,8 +306,8 @@ static void tcp_retransmit_timer(struct sock *sk)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		else if (sk->sk_family == AF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
-			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer " NIP6_FMT ":%u/%u shrinks window %u:%u. Repaired.\n",
-			       NIP6(np->daddr), ntohs(inet->dport),
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer %p6:%u/%u shrinks window %u:%u. Repaired.\n",
+			       &np->daddr, ntohs(inet->dport),
 			       inet->num, tp->snd_una, tp->snd_nxt);
 		}
 #endif
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 08909039d87..d28036659d2 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -186,10 +186,8 @@ u32 ipv6_addr_label(struct net *net,
 	label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT;
 	rcu_read_unlock();
 
-	ADDRLABEL(KERN_DEBUG "%s(addr=" NIP6_FMT ", type=%d, ifindex=%d) => %08x\n",
-			__func__,
-			NIP6(*addr), type, ifindex,
-			label);
+	ADDRLABEL(KERN_DEBUG "%s(addr=%p6, type=%d, ifindex=%d) => %08x\n",
+		  __func__, addr, type, ifindex, label);
 
 	return label;
 }
@@ -203,11 +201,8 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 	struct ip6addrlbl_entry *newp;
 	int addrtype;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d, label=%u)\n",
-			__func__,
-			NIP6(*prefix), prefixlen,
-			ifindex,
-			(unsigned int)label);
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d, label=%u)\n",
+		  __func__, prefix, prefixlen, ifindex, (unsigned int)label);
 
 	addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK);
 
@@ -294,12 +289,9 @@ static int ip6addrlbl_add(struct net *net,
 	struct ip6addrlbl_entry *newp;
 	int ret = 0;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
-			__func__,
-			NIP6(*prefix), prefixlen,
-			ifindex,
-			(unsigned int)label,
-			replace);
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
+		  __func__, prefix, prefixlen, ifindex, (unsigned int)label,
+		  replace);
 
 	newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label);
 	if (IS_ERR(newp))
@@ -321,10 +313,8 @@ static int __ip6addrlbl_del(struct net *net,
 	struct hlist_node *pos, *n;
 	int ret = -ESRCH;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d)\n",
-			__func__,
-			NIP6(*prefix), prefixlen,
-			ifindex);
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d)\n",
+		  __func__, prefix, prefixlen, ifindex);
 
 	hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) {
 		if (p->prefixlen == prefixlen &&
@@ -347,10 +337,8 @@ static int ip6addrlbl_del(struct net *net,
 	struct in6_addr prefix_buf;
 	int ret;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=" NIP6_FMT ", prefixlen=%d, ifindex=%d)\n",
-			__func__,
-			NIP6(*prefix), prefixlen,
-			ifindex);
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d)\n",
+		  __func__, prefix, prefixlen, ifindex);
 
 	ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
 	spin_lock(&ip6addrlbl_table.lock);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 2ff0c8233e4..9bc43f2527c 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -419,8 +419,8 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/" NIP6_FMT "\n",
-		 ntohl(ah->spi), NIP6(iph->daddr));
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%p6\n",
+		 ntohl(ah->spi), &iph->daddr);
 
 	xfrm_state_put(x);
 }
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index b181b08fb76..ec4be188c34 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -367,8 +367,8 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6);
 	if (!x)
 		return;
-	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" NIP6_FMT "\n",
-			ntohl(esph->spi), NIP6(iph->daddr));
+	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%p6\n",
+			ntohl(esph->spi), &iph->daddr);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 6bfffec2371..a8905146835 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -219,7 +219,7 @@ static int ipv6_dest_hao(struct sk_buff *skb, int optoff)
 
 	if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
 		LIMIT_NETDEBUG(
-			KERN_DEBUG "hao is not an unicast addr: " NIP6_FMT "\n", NIP6(hao->addr));
+			KERN_DEBUG "hao is not an unicast addr: %p6\n", &hao->addr);
 		goto discard;
 	}
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 508a713ac04..b3fa38e40dc 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -681,8 +681,8 @@ static int icmpv6_rcv(struct sk_buff *skb)
 		skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len,
 					     IPPROTO_ICMPV6, 0));
 		if (__skb_checksum_complete(skb)) {
-			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [" NIP6_FMT " > " NIP6_FMT "]\n",
-				       NIP6(*saddr), NIP6(*daddr));
+			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%p6 > %p6]\n",
+				       saddr, daddr);
 			goto discard_it;
 		}
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 182f8a177e7..798e6a29dd8 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -297,9 +297,8 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 		const struct mfc6_cache *mfc = v;
 		const struct ipmr_mfc_iter *it = seq->private;
 
-		seq_printf(seq,
-			   NIP6_FMT " " NIP6_FMT " %-3d %8ld %8ld %8ld",
-			   NIP6(mfc->mf6c_mcastgrp), NIP6(mfc->mf6c_origin),
+		seq_printf(seq, "%p6 %p6 %-3d %8ld %8ld %8ld",
+			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
 			   mfc->mf6c_parent,
 			   mfc->mfc_un.res.pkt,
 			   mfc->mfc_un.res.bytes,
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 4545e430686..9566d8f7314 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -67,8 +67,8 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" NIP6_FMT "\n",
-			spi, NIP6(iph->daddr));
+	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%p6\n",
+			spi, &iph->daddr);
 	xfrm_state_put(x);
 }
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 172438320ee..191bb0722a7 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -647,11 +647,8 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 
 	if ((probes -= neigh->parms->ucast_probes) < 0) {
 		if (!(neigh->nud_state & NUD_VALID)) {
-			ND_PRINTK1(KERN_DEBUG
-				   "%s(): trying to ucast probe in NUD_INVALID: "
-				   NIP6_FMT "\n",
-				   __func__,
-				   NIP6(*target));
+			ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %p6\n",
+				   __func__, target);
 		}
 		ndisc_send_ns(dev, neigh, target, target, saddr);
 	} else if ((probes -= neigh->parms->app_probes) < 0) {
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index caa441d0956..a61ce301000 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -61,7 +61,7 @@ static void dump_packet(const struct nf_loginfo *info,
 	}
 
 	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-	printk("SRC=" NIP6_FMT " DST=" NIP6_FMT " ", NIP6(ih->saddr), NIP6(ih->daddr));
+	printk("SRC=%p6 DST=%p6 ", &ih->saddr, &ih->daddr);
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
 	printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index e91db16611d..b165a273c6c 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -56,9 +56,8 @@ static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 static int ipv6_print_tuple(struct seq_file *s,
 			    const struct nf_conntrack_tuple *tuple)
 {
-	return seq_printf(s, "src=" NIP6_FMT " dst=" NIP6_FMT " ",
-			  NIP6(*((struct in6_addr *)tuple->src.u3.ip6)),
-			  NIP6(*((struct in6_addr *)tuple->dst.u3.ip6)));
+	return seq_printf(s, "src=%p6 dst=%p6 ",
+			  tuple->src.u3.ip6, tuple->dst.u3.ip6);
 }
 
 /*
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b6b356b7912..483550cfdf3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -872,12 +872,10 @@ static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb)
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 		if (net_ratelimit()) {
-			printk(KERN_INFO "MD5 Hash %s for "
-			       "(" NIP6_FMT ", %u)->"
-			       "(" NIP6_FMT ", %u)\n",
+			printk(KERN_INFO "MD5 Hash %s for (%p6, %u)->(%p6, %u)\n",
 			       genhash ? "failed" : "mismatch",
-			       NIP6(ip6h->saddr), ntohs(th->source),
-			       NIP6(ip6h->daddr), ntohs(th->dest));
+			       &ip6h->saddr, ntohs(th->source),
+			       &ip6h->daddr, ntohs(th->dest));
 		}
 		return 1;
 	}
-- 
cgit v1.2.3-70-g09d2


From b189db5d299c6824780af5590564ff608adb3dea Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 28 Oct 2008 22:38:52 -0700
Subject: net: remove NIP6(), NIP6_FMT, NIP6_SEQFMT and final users

Open code NIP6_FMT in the one call inside sscanf and one user
of NIP6() that could use %p6 in the netfilter code.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h         | 12 ------------
 net/bridge/netfilter/ebt_log.c |  6 ++----
 net/sunrpc/svcauth_unix.c      |  2 +-
 3 files changed, 3 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 396a350b87a..77777c46009 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -357,18 +357,6 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
 	((unsigned char *)&addr)[3]
 #define NIPQUAD_FMT "%u.%u.%u.%u"
 
-#define NIP6(addr) \
-	ntohs((addr).s6_addr16[0]), \
-	ntohs((addr).s6_addr16[1]), \
-	ntohs((addr).s6_addr16[2]), \
-	ntohs((addr).s6_addr16[3]), \
-	ntohs((addr).s6_addr16[4]), \
-	ntohs((addr).s6_addr16[5]), \
-	ntohs((addr).s6_addr16[6]), \
-	ntohs((addr).s6_addr16[7])
-#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
-#define NIP6_SEQFMT "%04x%04x%04x%04x%04x%04x%04x%04x"
-
 #if defined(__LITTLE_ENDIAN)
 #define HIPQUAD(addr) \
 	((unsigned char *)&addr)[3], \
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 3d33c608906..3654f3ebdb8 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -133,10 +133,8 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
 			printk(" INCOMPLETE IPv6 header");
 			goto out;
 		}
-		printk(" IPv6 SRC=%x:%x:%x:%x:%x:%x:%x:%x "
-		       "IPv6 DST=%x:%x:%x:%x:%x:%x:%x:%x, IPv6 "
-		       "priority=0x%01X, Next Header=%d", NIP6(ih->saddr),
-		       NIP6(ih->daddr), ih->priority, ih->nexthdr);
+		printk(" IPv6 SRC=%p6 IPv6 DST=%p6, IPv6 priority=0x%01X, Next Header=%d",
+		       &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
 		nexthdr = ih->nexthdr;
 		offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr);
 		if (offset_ph == -1)
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 9a6d0cfd4ce..eb640c1a1bc 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -214,7 +214,7 @@ static int ip_map_parse(struct cache_detail *cd,
 		addr.s6_addr32[2] = htonl(0xffff);
 		addr.s6_addr32[3] =
 			htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4);
-       } else if (sscanf(buf, NIP6_FMT "%c",
+       } else if (sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x%c",
 			&b1, &b2, &b3, &b4, &b5, &b6, &b7, &b8, &c) == 8) {
 		addr.s6_addr16[0] = htons(b1);
 		addr.s6_addr16[1] = htons(b2);
-- 
cgit v1.2.3-70-g09d2


From 96631ed16c514cf8b28fab991a076985ce378c26 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 29 Oct 2008 11:19:58 -0700
Subject: udp: introduce sk_for_each_rcu_safenext()

Corey Minyard found a race added in commit 271b72c7fa82c2c7a795bc16896149933110672d
(udp: RCU handling for Unicast packets.)

 "If the socket is moved from one list to another list in-between the
 time the hash is calculated and the next field is accessed, and the
 socket has moved to the end of the new list, the traversal will not
 complete properly on the list it should have, since the socket will
 be on the end of the new list and there's not a way to tell it's on a
 new list and restart the list traversal.  I think that this can be
 solved by pre-fetching the "next" field (with proper barriers) before
 checking the hash."

This patch corrects this problem, introducing a new
sk_for_each_rcu_safenext() macro.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist.h | 17 +++++++++++++++++
 include/net/sock.h      |  4 ++--
 net/ipv4/udp.c          |  4 ++--
 net/ipv6/udp.c          |  4 ++--
 4 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e649bd3f2c9..3ba2998b22b 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -383,5 +383,22 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference(pos->next))
 
+/**
+ * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ * @next:       the &struct hlist_node to use as a next cursor
+ *
+ * Special version of hlist_for_each_entry_rcu that make sure
+ * each next pointer is fetched before each iteration.
+ */
+#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
+	for (pos = rcu_dereference((head)->first);			 \
+		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
+		pos = rcu_dereference(next))
+
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 0bea25db547..a4f6d3fc047 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,8 +419,8 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu(__sk, node, list) \
-	hlist_for_each_entry_rcu(__sk, node, list, sk_node)
+#define sk_for_each_rcu_safenext(__sk, node, list, next) \
+	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ced820318f9..c3ecec8a9e1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -256,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node;
+	struct hlist_node *node, *next;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -266,7 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1d9790e43df..32d914db6c4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node;
+	struct hlist_node *node, *next;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,7 +108,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu(sk, node, &hslot->head) {
+	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
 		/*
 		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
 		 * We must check this item was not moved to another chain
-- 
cgit v1.2.3-70-g09d2


From 5b095d98928fdb9e3b75be20a54b7a6cbf6ca9ad Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 29 Oct 2008 12:52:50 -0700
Subject: net: replace %p6 with %pI6

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/firmware/iscsi_ibft.c                  |  2 +-
 drivers/infiniband/core/sysfs.c                |  2 +-
 drivers/infiniband/hw/mthca/mthca_mcg.c        |  4 ++--
 drivers/infiniband/ulp/ipoib/ipoib_cm.c        |  4 ++--
 drivers/infiniband/ulp/ipoib/ipoib_main.c      | 12 +++++------
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 30 +++++++++++++-------------
 drivers/infiniband/ulp/srp/ib_srp.c            |  6 +++---
 drivers/net/mlx4/mcg.c                         |  4 ++--
 drivers/scsi/iscsi_tcp.c                       |  2 +-
 fs/lockd/host.c                                |  2 +-
 fs/nfs/super.c                                 |  2 +-
 include/linux/sunrpc/svc_xprt.h                |  2 +-
 include/net/ip_vs.h                            |  2 +-
 include/net/netfilter/nf_conntrack_tuple.h     |  2 +-
 include/net/sctp/sctp.h                        |  2 +-
 net/bridge/netfilter/ebt_log.c                 |  2 +-
 net/ipv4/tcp_input.c                           |  2 +-
 net/ipv4/tcp_timer.c                           |  2 +-
 net/ipv6/addrlabel.c                           | 10 ++++-----
 net/ipv6/ah6.c                                 |  2 +-
 net/ipv6/esp6.c                                |  2 +-
 net/ipv6/exthdrs.c                             |  2 +-
 net/ipv6/icmp.c                                |  2 +-
 net/ipv6/ip6mr.c                               |  2 +-
 net/ipv6/ipcomp6.c                             |  2 +-
 net/ipv6/ndisc.c                               |  2 +-
 net/ipv6/netfilter/ip6t_LOG.c                  |  2 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  2 +-
 net/ipv6/tcp_ipv6.c                            |  2 +-
 net/netfilter/ipvs/ip_vs_conn.c                |  4 ++--
 net/netfilter/ipvs/ip_vs_core.c                |  4 ++--
 net/netfilter/ipvs/ip_vs_ctl.c                 |  4 ++--
 net/netfilter/ipvs/ip_vs_proto.c               |  6 +++---
 net/netfilter/ipvs/ip_vs_proto_ah_esp.c        |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c                |  8 +++----
 net/netfilter/nf_conntrack_ftp.c               |  2 +-
 net/netfilter/nf_conntrack_h323_main.c         |  4 ++--
 net/netfilter/xt_hashlimit.c                   |  2 +-
 net/netfilter/xt_recent.c                      |  2 +-
 net/netlabel/netlabel_addrlist.c               |  2 +-
 net/sctp/ipv6.c                                | 18 ++++++++--------
 net/sctp/sm_statefuns.c                        |  2 +-
 net/sunrpc/clnt.c                              |  2 +-
 net/sunrpc/rpcb_clnt.c                         |  2 +-
 net/sunrpc/svcauth_unix.c                      |  4 ++--
 net/sunrpc/xprtsock.c                          |  8 +++----
 net/xfrm/xfrm_policy.c                         |  4 ++--
 net/xfrm/xfrm_state.c                          |  4 ++--
 security/selinux/avc.c                         |  2 +-
 49 files changed, 100 insertions(+), 100 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index 0a6472097a8..acb82aff880 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -290,7 +290,7 @@ static ssize_t sprintf_ipaddr(char *buf, u8 *ip)
 		/*
 		 * IPv6
 		 */
-		str += sprintf(str, "%p6", ip);
+		str += sprintf(str, "%pI6", ip);
 	}
 	str += sprintf(str, "\n");
 	return str - buf;
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index e985193d631..4f4d1bb9f06 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -262,7 +262,7 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
 	if (ret)
 		return ret;
 
-	return sprintf(buf, "%p6\n", gid.raw);
+	return sprintf(buf, "%pI6\n", gid.raw);
 }
 
 static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
index 693bed0b2d1..d4c81053e43 100644
--- a/drivers/infiniband/hw/mthca/mthca_mcg.c
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -87,7 +87,7 @@ static int find_mgm(struct mthca_dev *dev,
 	}
 
 	if (0)
-		mthca_dbg(dev, "Hash for %p6 is %04x\n", gid, *hash);
+		mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
 
 	*index = *hash;
 	*prev  = -1;
@@ -254,7 +254,7 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 		goto out;
 
 	if (index == -1) {
-		mthca_err(dev, "MGID %p6 not found\n", gid->raw);
+		mthca_err(dev, "MGID %pI6 not found\n", gid->raw);
 		err = -EINVAL;
 		goto out;
 	}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index d98d87bfe36..47d588ba2a7 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1128,7 +1128,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
 		goto err_send_cm;
 	}
 
-	ipoib_dbg(priv, "Request connection 0x%x for gid %p6 qpn 0x%x\n",
+	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
 		  p->qp->qp_num, pathrec->dgid.raw, qpn);
 
 	return 0;
@@ -1276,7 +1276,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
 	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 		list_move(&tx->list, &priv->cm.reap_list);
 		queue_work(ipoib_workqueue, &priv->cm.reap_task);
-		ipoib_dbg(priv, "Reap connection for gid %p6\n",
+		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
 			  tx->neigh->dgid.raw);
 		tx->neigh = NULL;
 	}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index e7f4f94c3e9..b3a671895bd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -359,7 +359,7 @@ void ipoib_mark_paths_invalid(struct net_device *dev)
 	spin_lock_irq(&priv->lock);
 
 	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
-		ipoib_dbg(priv, "mark path LID 0x%04x GID %p6 invalid\n",
+		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
 			be16_to_cpu(path->pathrec.dlid),
 			path->pathrec.dgid.raw);
 		path->valid =  0;
@@ -413,10 +413,10 @@ static void path_rec_completion(int status,
 	unsigned long flags;
 
 	if (!status)
-		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %p6\n",
+		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
 			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
 	else
-		ipoib_dbg(priv, "PathRec status %d for GID %p6\n",
+		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
 			  status, path->pathrec.dgid.raw);
 
 	skb_queue_head_init(&skqueue);
@@ -527,7 +527,7 @@ static int path_rec_start(struct net_device *dev,
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	ipoib_dbg(priv, "Start path record lookup for %p6\n",
+	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
 		  path->pathrec.dgid.raw);
 
 	init_completion(&path->done);
@@ -764,7 +764,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 			if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
 			    (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
-				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %p6\n",
+				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n",
 					   skb->dst ? "neigh" : "dst",
 					   be16_to_cpup((__be16 *) skb->data),
 					   IPOIB_QPN(phdr->hwaddr),
@@ -844,7 +844,7 @@ static void ipoib_neigh_cleanup(struct neighbour *n)
 	else
 		return;
 	ipoib_dbg(priv,
-		  "neigh_cleanup for %06x %p6\n",
+		  "neigh_cleanup for %06x %pI6\n",
 		  IPOIB_QPN(n->ha),
 		  n->ha + 4);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 0de79cf4c07..a2eb3b9789e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -71,7 +71,7 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast)
 	struct ipoib_neigh *neigh, *tmp;
 	int tx_dropped = 0;
 
-	ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %p6\n",
+	ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n",
 			mcast->mcmember.mgid.raw);
 
 	spin_lock_irq(&priv->lock);
@@ -204,7 +204,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 
 	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
 		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
-			ipoib_warn(priv, "multicast group %p6 already attached\n",
+			ipoib_warn(priv, "multicast group %pI6 already attached\n",
 				   mcast->mcmember.mgid.raw);
 
 			return 0;
@@ -213,7 +213,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
 					 &mcast->mcmember.mgid, set_qkey);
 		if (ret < 0) {
-			ipoib_warn(priv, "couldn't attach QP to multicast group %p6\n",
+			ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n",
 				   mcast->mcmember.mgid.raw);
 
 			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
@@ -245,7 +245,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 			mcast->ah = ah;
 			spin_unlock_irq(&priv->lock);
 
-			ipoib_dbg_mcast(priv, "MGID %p6 AV %p, LID 0x%04x, SL %d\n",
+			ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n",
 					mcast->mcmember.mgid.raw,
 					mcast->ah->ah,
 					be16_to_cpu(mcast->mcmember.mlid),
@@ -291,7 +291,7 @@ ipoib_mcast_sendonly_join_complete(int status,
 
 	if (status) {
 		if (mcast->logcount++ < 20)
-			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %p6, status %d\n",
+			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
 					mcast->mcmember.mgid.raw, status);
 
 		/* Flush out any queued packets */
@@ -351,7 +351,7 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 		ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
 			   ret);
 	} else {
-		ipoib_dbg_mcast(priv, "no multicast record for %p6, starting join\n",
+		ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
 				mcast->mcmember.mgid.raw);
 	}
 
@@ -380,7 +380,7 @@ static int ipoib_mcast_join_complete(int status,
 	struct net_device *dev = mcast->dev;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	ipoib_dbg_mcast(priv, "join completion for %p6 (status %d)\n",
+	ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
 			mcast->mcmember.mgid.raw, status);
 
 	/* We trap for port events ourselves. */
@@ -410,10 +410,10 @@ static int ipoib_mcast_join_complete(int status,
 
 	if (mcast->logcount++ < 20) {
 		if (status == -ETIMEDOUT) {
-			ipoib_dbg_mcast(priv, "multicast join failed for %p6, status %d\n",
+			ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
 					mcast->mcmember.mgid.raw, status);
 		} else {
-			ipoib_warn(priv, "multicast join failed for %p6, status %d\n",
+			ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
 				   mcast->mcmember.mgid.raw, status);
 		}
 	}
@@ -446,7 +446,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 	ib_sa_comp_mask comp_mask;
 	int ret = 0;
 
-	ipoib_dbg_mcast(priv, "joining MGID %p6\n", mcast->mcmember.mgid.raw);
+	ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
 
 	rec.mgid     = mcast->mcmember.mgid;
 	rec.port_gid = priv->local_gid;
@@ -631,7 +631,7 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
 		ib_sa_free_multicast(mcast->mc);
 
 	if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
-		ipoib_dbg_mcast(priv, "leaving MGID %p6\n",
+		ipoib_dbg_mcast(priv, "leaving MGID %pI6\n",
 				mcast->mcmember.mgid.raw);
 
 		/* Remove ourselves from the multicast group */
@@ -663,7 +663,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 	mcast = __ipoib_mcast_find(dev, mgid);
 	if (!mcast) {
 		/* Let's create a new send only group now */
-		ipoib_dbg_mcast(priv, "setting up send only multicast group for %p6\n",
+		ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
 				mgid);
 
 		mcast = ipoib_mcast_alloc(dev, 0);
@@ -797,13 +797,13 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 			/* ignore group which is directly joined by userspace */
 			if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
 			    !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
-				ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %p6\n",
+				ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n",
 						mgid.raw);
 				continue;
 			}
 
 			/* Not found or send-only group, let's add a new entry */
-			ipoib_dbg_mcast(priv, "adding multicast entry for mgid %p6\n",
+			ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n",
 					mgid.raw);
 
 			nmcast = ipoib_mcast_alloc(dev, 0);
@@ -837,7 +837,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
 		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
 		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
-			ipoib_dbg_mcast(priv, "deleting multicast group %p6\n",
+			ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n",
 					mcast->mcmember.mgid.raw);
 
 			rb_erase(&mcast->rb_node, &priv->multicast_tree);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index bc825310c6d..7c13db885bf 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1514,7 +1514,7 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
 	    target->state == SRP_TARGET_REMOVED)
 		return -ENODEV;
 
-	return sprintf(buf, "%p6\n", target->path.dgid.raw);
+	return sprintf(buf, "%pI6\n", target->path.dgid.raw);
 }
 
 static ssize_t show_orig_dgid(struct device *dev,
@@ -1526,7 +1526,7 @@ static ssize_t show_orig_dgid(struct device *dev,
 	    target->state == SRP_TARGET_REMOVED)
 		return -ENODEV;
 
-	return sprintf(buf, "%p6\n", target->orig_dgid);
+	return sprintf(buf, "%pI6\n", target->orig_dgid);
 }
 
 static ssize_t show_zero_req_lim(struct device *dev,
@@ -1867,7 +1867,7 @@ static ssize_t srp_create_target(struct device *dev,
 
 	shost_printk(KERN_DEBUG, target->scsi_host, PFX
 		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x "
-		     "service_id %016llx dgid %p6\n",
+		     "service_id %016llx dgid %pI6\n",
 	       (unsigned long long) be64_to_cpu(target->id_ext),
 	       (unsigned long long) be64_to_cpu(target->ioc_guid),
 	       be16_to_cpu(target->path.pkey),
diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c
index 6f79e84a5c9..b1622062b12 100644
--- a/drivers/net/mlx4/mcg.c
+++ b/drivers/net/mlx4/mcg.c
@@ -118,7 +118,7 @@ static int find_mgm(struct mlx4_dev *dev,
 		return err;
 
 	if (0)
-		mlx4_dbg(dev, "Hash for %p6 is %04x\n", gid, *hash);
+		mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
 
 	*index = *hash;
 	*prev  = -1;
@@ -267,7 +267,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16])
 		goto out;
 
 	if (index == -1) {
-		mlx4_err(dev, "MGID %p6 not found\n", gid);
+		mlx4_err(dev, "MGID %pI6 not found\n", gid);
 		err = -EINVAL;
 		goto out;
 	}
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index ef929aef7c1..24d09028a27 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -1608,7 +1608,7 @@ static int iscsi_tcp_get_addr(struct iscsi_conn *conn, struct socket *sock,
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)addr;
 		spin_lock_bh(&conn->session->lock);
-		sprintf(buf, "%p6", &sin6->sin6_addr);
+		sprintf(buf, "%pI6", &sin6->sin6_addr);
 		*port = be16_to_cpu(sin6->sin6_port);
 		spin_unlock_bh(&conn->session->lock);
 		break;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 344e6b475e0..c8ab7d70390 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -122,7 +122,7 @@ static void nlm_display_address(const struct sockaddr *sap,
 			snprintf(buf, len, NIPQUAD_FMT,
 				 NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
 		else
-			snprintf(buf, len, "%p6", &sin6->sin6_addr);
+			snprintf(buf, len, "%pI6", &sin6->sin6_addr);
 		break;
 	default:
 		snprintf(buf, len, "unsupported address family");
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5fe77219df7..eb391d8d70b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -468,7 +468,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
 	}
 	case AF_INET6: {
 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-		seq_printf(m, ",mountaddr=%p6", &sin6->sin6_addr);
+		seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr);
 		break;
 	}
 	default:
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 42e01c93c7e..51cb75ea42d 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -145,7 +145,7 @@ static inline char *__svc_print_addr(struct sockaddr *addr,
 		break;
 
 	case AF_INET6:
-		snprintf(buf, len, "%p6, port=%u",
+		snprintf(buf, len, "%pI6, port=%u",
 			 &((struct sockaddr_in6 *)addr)->sin6_addr,
 			ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
 		break;
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 6a669206709..af48cada561 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -87,7 +87,7 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
 	int len;
 #ifdef CONFIG_IP_VS_IPV6
 	if (af == AF_INET6)
-		len = snprintf(&buf[*idx], buf_len - *idx, "[%p6]",
+		len = snprintf(&buf[*idx], buf_len - *idx, "[%pI6]",
 			       &addr->in6) + 1;
 	else
 #endif
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 303efaf68d0..42f1fc96f3e 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -122,7 +122,7 @@ static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t)
 static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t)
 {
 #ifdef DEBUG
-	printk("tuple %p: %u %p6 %hu -> %p6 %hu\n",
+	printk("tuple %p: %u %pI6 %hu -> %pI6 %hu\n",
 	       t, t->dst.protonum,
 	       t->src.u3.all, ntohs(t->src.u.all),
 	       t->dst.u3.all, ntohs(t->dst.u.all));
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index a84c3976e1b..e71b0f7ce88 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -285,7 +285,7 @@ extern int sctp_debug_flag;
 	if (sctp_debug_flag) { \
 		if (saddr->sa.sa_family == AF_INET6) { \
 			printk(KERN_DEBUG \
-			       lead "%p6" trail, \
+			       lead "%pI6" trail, \
 			       leadparm, \
 			       &saddr->v6.sin6_addr, \
 			       otherparms); \
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 3654f3ebdb8..5e7cff3542f 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -133,7 +133,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum,
 			printk(" INCOMPLETE IPv6 header");
 			goto out;
 		}
-		printk(" IPv6 SRC=%p6 IPv6 DST=%p6, IPv6 priority=0x%01X, Next Header=%d",
+		printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
 		       &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
 		nexthdr = ih->nexthdr;
 		offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 191c06bb0f6..04909e4b3c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2346,7 +2346,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 	else if (sk->sk_family == AF_INET6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-		printk(KERN_DEBUG "Undo %s %p6/%u c%u l%u ss%u/%u p%u\n",
+		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
 		       &np->daddr, ntohs(inet->dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4e6ee520523..979c9d604eb 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -306,7 +306,7 @@ static void tcp_retransmit_timer(struct sock *sk)
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 		else if (sk->sk_family == AF_INET6) {
 			struct ipv6_pinfo *np = inet6_sk(sk);
-			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer %p6:%u/%u shrinks window %u:%u. Repaired.\n",
+			LIMIT_NETDEBUG(KERN_DEBUG "TCP: Treason uncloaked! Peer %pI6:%u/%u shrinks window %u:%u. Repaired.\n",
 			       &np->daddr, ntohs(inet->dport),
 			       inet->num, tp->snd_una, tp->snd_nxt);
 		}
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index d28036659d2..6ff73c4c126 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -186,7 +186,7 @@ u32 ipv6_addr_label(struct net *net,
 	label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT;
 	rcu_read_unlock();
 
-	ADDRLABEL(KERN_DEBUG "%s(addr=%p6, type=%d, ifindex=%d) => %08x\n",
+	ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n",
 		  __func__, addr, type, ifindex, label);
 
 	return label;
@@ -201,7 +201,7 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 	struct ip6addrlbl_entry *newp;
 	int addrtype;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d, label=%u)\n",
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n",
 		  __func__, prefix, prefixlen, ifindex, (unsigned int)label);
 
 	addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK);
@@ -289,7 +289,7 @@ static int ip6addrlbl_add(struct net *net,
 	struct ip6addrlbl_entry *newp;
 	int ret = 0;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
 		  __func__, prefix, prefixlen, ifindex, (unsigned int)label,
 		  replace);
 
@@ -313,7 +313,7 @@ static int __ip6addrlbl_del(struct net *net,
 	struct hlist_node *pos, *n;
 	int ret = -ESRCH;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d)\n",
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
 		  __func__, prefix, prefixlen, ifindex);
 
 	hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) {
@@ -337,7 +337,7 @@ static int ip6addrlbl_del(struct net *net,
 	struct in6_addr prefix_buf;
 	int ret;
 
-	ADDRLABEL(KERN_DEBUG "%s(prefix=%p6, prefixlen=%d, ifindex=%d)\n",
+	ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
 		  __func__, prefix, prefixlen, ifindex);
 
 	ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 9bc43f2527c..7a8a01369e5 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -419,7 +419,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%p6\n",
+	NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n",
 		 ntohl(ah->spi), &iph->daddr);
 
 	xfrm_state_put(x);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ec4be188c34..c02a6308def 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -367,7 +367,7 @@ static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6);
 	if (!x)
 		return;
-	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%p6\n",
+	printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%pI6\n",
 			ntohl(esph->spi), &iph->daddr);
 	xfrm_state_put(x);
 }
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index a8905146835..1c7f400a3cf 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -219,7 +219,7 @@ static int ipv6_dest_hao(struct sk_buff *skb, int optoff)
 
 	if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
 		LIMIT_NETDEBUG(
-			KERN_DEBUG "hao is not an unicast addr: %p6\n", &hao->addr);
+			KERN_DEBUG "hao is not an unicast addr: %pI6\n", &hao->addr);
 		goto discard;
 	}
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index b3fa38e40dc..3c2821f9b52 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -681,7 +681,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
 		skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len,
 					     IPPROTO_ICMPV6, 0));
 		if (__skb_checksum_complete(skb)) {
-			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%p6 > %p6]\n",
+			LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%pI6 > %pI6]\n",
 				       saddr, daddr);
 			goto discard_it;
 		}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 798e6a29dd8..c491fb98a5e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -297,7 +297,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 		const struct mfc6_cache *mfc = v;
 		const struct ipmr_mfc_iter *it = seq->private;
 
-		seq_printf(seq, "%p6 %p6 %-3d %8ld %8ld %8ld",
+		seq_printf(seq, "%pI6 %pI6 %-3d %8ld %8ld %8ld",
 			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
 			   mfc->mf6c_parent,
 			   mfc->mfc_un.res.pkt,
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 9566d8f7314..d4576a9c154 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -67,7 +67,7 @@ static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!x)
 		return;
 
-	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%p6\n",
+	printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI6\n",
 			spi, &iph->daddr);
 	xfrm_state_put(x);
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 191bb0722a7..2a6752dae09 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -647,7 +647,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 
 	if ((probes -= neigh->parms->ucast_probes) < 0) {
 		if (!(neigh->nud_state & NUD_VALID)) {
-			ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %p6\n",
+			ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %pI6\n",
 				   __func__, target);
 		}
 		ndisc_send_ns(dev, neigh, target, target, saddr);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index a61ce301000..02885e8bb69 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -61,7 +61,7 @@ static void dump_packet(const struct nf_loginfo *info,
 	}
 
 	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-	printk("SRC=%p6 DST=%p6 ", &ih->saddr, &ih->daddr);
+	printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
 	printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index b165a273c6c..727b9530448 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -56,7 +56,7 @@ static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 static int ipv6_print_tuple(struct seq_file *s,
 			    const struct nf_conntrack_tuple *tuple)
 {
-	return seq_printf(s, "src=%p6 dst=%p6 ",
+	return seq_printf(s, "src=%pI6 dst=%pI6 ",
 			  tuple->src.u3.ip6, tuple->dst.u3.ip6);
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 483550cfdf3..984276463a8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -872,7 +872,7 @@ static int tcp_v6_inbound_md5_hash (struct sock *sk, struct sk_buff *skb)
 
 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
 		if (net_ratelimit()) {
-			printk(KERN_INFO "MD5 Hash %s for (%p6, %u)->(%p6, %u)\n",
+			printk(KERN_INFO "MD5 Hash %s for (%pI6, %u)->(%pI6, %u)\n",
 			       genhash ? "failed" : "mismatch",
 			       &ip6h->saddr, ntohs(th->source),
 			       &ip6h->daddr, ntohs(th->dest));
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 89bf65be6f2..60aba45023f 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -820,7 +820,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
-			seq_printf(seq, "%-3s %p6 %04X %p6 %04X %p6 %04X %-11s %7lu\n",
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n",
 				ip_vs_proto_name(cp->protocol),
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
@@ -881,7 +881,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
 
 #ifdef CONFIG_IP_VS_IPV6
 		if (cp->af == AF_INET6)
-			seq_printf(seq, "%-3s %p6 %04X %p6 %04X %p6 %04X %-11s %-6s %7lu\n",
+			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n",
 				ip_vs_proto_name(cp->protocol),
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 9400587a01e..c3c68443b5b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -805,7 +805,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 	if (ic == NULL)
 		return NF_DROP;
 
-	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %p6->%p6\n",
+	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
 		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
 		  &iph->saddr, &iph->daddr);
 
@@ -1175,7 +1175,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
 	if (ic == NULL)
 		return NF_DROP;
 
-	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %p6->%p6\n",
+	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
 		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
 		  &iph->saddr, &iph->daddr);
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 28c47c0d514..76db27ec963 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1867,7 +1867,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		if (iter->table == ip_vs_svc_table) {
 #ifdef CONFIG_IP_VS_IPV6
 			if (svc->af == AF_INET6)
-				seq_printf(seq, "%s  [%p6]:%04X %s ",
+				seq_printf(seq, "%s  [%pI6]:%04X %s ",
 					   ip_vs_proto_name(svc->protocol),
 					   &svc->addr.in6,
 					   ntohs(svc->port),
@@ -1895,7 +1895,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 #ifdef CONFIG_IP_VS_IPV6
 			if (dest->af == AF_INET6)
 				seq_printf(seq,
-					   "  -> [%p6]:%04X"
+					   "  -> [%pI6]:%04X"
 					   "      %-7s %-6d %-10d %-10d\n",
 					   &dest->addr.in6,
 					   ntohs(dest->port),
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index d7ce4f1839c..54cd67fbfe7 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -203,7 +203,7 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 	if (ih == NULL)
 		sprintf(buf, "%s TRUNCATED", pp->name);
 	else if (ih->nexthdr == IPPROTO_FRAGMENT)
-		sprintf(buf, "%s %p6->%p6 frag",
+		sprintf(buf, "%s %pI6->%pI6 frag",
 			pp->name, &ih->saddr, &ih->daddr);
 	else {
 		__be16 _ports[2], *pptr;
@@ -211,10 +211,10 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED %p6->%p6",
+			sprintf(buf, "%s TRUNCATED %pI6->%pI6",
 				pp->name, &ih->saddr, &ih->daddr);
 		else
-			sprintf(buf, "%s %p6:%u->%p6:%u",
+			sprintf(buf, "%s %pI6:%u->%pI6:%u",
 				pp->name,
 				&ih->saddr, ntohs(pptr[0]),
 				&ih->daddr, ntohs(pptr[1]));
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 59f2d11b683..6ede8881204 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -154,7 +154,7 @@ ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 	if (ih == NULL)
 		sprintf(buf, "%s TRUNCATED", pp->name);
 	else
-		sprintf(buf, "%s %p6->%p6",
+		sprintf(buf, "%s %pI6->%pI6",
 			pp->name, &ih->saddr, &ih->daddr);
 
 	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index be34c335cab..fc342dda950 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -141,12 +141,12 @@ __ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
 								 NULL, &fl);
 			if (!rt) {
 				spin_unlock(&dest->dst_lock);
-				IP_VS_DBG_RL("ip6_route_output error, dest: %p6\n",
+				IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
 					     &dest->addr.in6);
 				return NULL;
 			}
 			__ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
-			IP_VS_DBG(10, "new dst %p6, refcnt=%d\n",
+			IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
 				  &dest->addr.in6,
 				  atomic_read(&rt->u.dst.__refcnt));
 		}
@@ -166,7 +166,7 @@ __ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
 
 		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
 		if (!rt) {
-			IP_VS_DBG_RL("ip6_route_output error, dest: %p6\n",
+			IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
 				     &cp->daddr.in6);
 			return NULL;
 		}
@@ -300,7 +300,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
 	if (!rt) {
-		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, dest: %p6\n",
+		IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, dest: %pI6\n",
 			     &iph->daddr);
 		goto tx_error_icmp;
 	}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 05bf82d345c..8cab6d59590 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -467,7 +467,7 @@ static int help(struct sk_buff *skb,
 				 NIPQUAD(cmd.u3.ip),
 				 NIPQUAD(ct->tuplehash[dir].tuple.src.u3.ip));
 		} else {
-			pr_debug("conntrack_ftp: NOT RECORDING: %p6 != %p6\n",
+			pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n",
 				 cmd.u3.ip6,
 				 ct->tuplehash[dir].tuple.src.u3.ip6);
 		}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 29e49b1c80b..99bc803d1dd 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -850,7 +850,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 	    get_h225_addr(ct, *data, &setup->destCallSignalAddress,
 			  &addr, &port) &&
 	    memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) {
-		pr_debug("nf_ct_q931: set destCallSignalAddress %p6:%hu->%p6:%hu\n",
+		pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
 			 &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
 			 ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
 		ret = set_h225_addr(skb, data, dataoff,
@@ -866,7 +866,7 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
 	    get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
 			  &addr, &port) &&
 	    memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) {
-		pr_debug("nf_ct_q931: set sourceCallSignalAddress %p6:%hu->%p6:%hu\n",
+		pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
 			 &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
 			 ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
 		ret = set_h225_addr(skb, data, dataoff,
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index f04c6ed4367..6379717f904 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -904,7 +904,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ent->rateinfo.cost);
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %p6:%u->%p6:%u %u %u %u\n",
+		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip6.src,
 				 ntohs(ent->dst.src_port),
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index a377ea333e1..b785727a5bf 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -426,7 +426,7 @@ static int recent_seq_show(struct seq_file *seq, void *v)
 			   "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl,
 			   e->stamps[i], e->index);
 	else
-		seq_printf(seq, "src=%p6 ttl: %u last_seen: %lu oldest_pkt: %u",
+		seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u",
 			   &e->addr.in6, e->ttl, e->stamps[i], e->index);
 	for (i = 0; i < e->nstamps; i++)
 		seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
diff --git a/net/netlabel/netlabel_addrlist.c b/net/netlabel/netlabel_addrlist.c
index 614c95ec39d..8b1c58b0820 100644
--- a/net/netlabel/netlabel_addrlist.c
+++ b/net/netlabel/netlabel_addrlist.c
@@ -370,7 +370,7 @@ void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf,
 
 	if (dev != NULL)
 		audit_log_format(audit_buf, " netif=%s", dev);
-	audit_log_format(audit_buf, " %s=%p6", dir, addr);
+	audit_log_format(audit_buf, " %s=%pI6", dir, addr);
 	if (ntohl(mask->s6_addr32[3]) != 0xffffffff) {
 		u32 mask_len = 0;
 		u32 mask_val;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e82668bd2b5..ceaa4aa066e 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -223,7 +223,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
 		ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
 	}
 
-	SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%p6 dst:%p6\n",
+	SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n",
 			  __func__, skb, skb->len,
 			  &fl.fl6_src, &fl.fl6_dst);
 
@@ -251,18 +251,18 @@ static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc,
 		fl.oif = daddr->v6.sin6_scope_id;
 
 
-	SCTP_DEBUG_PRINTK("%s: DST=%p6 ", __func__, &fl.fl6_dst);
+	SCTP_DEBUG_PRINTK("%s: DST=%pI6 ", __func__, &fl.fl6_dst);
 
 	if (saddr) {
 		ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr);
-		SCTP_DEBUG_PRINTK("SRC=%p6 - ", &fl.fl6_src);
+		SCTP_DEBUG_PRINTK("SRC=%pI6 - ", &fl.fl6_src);
 	}
 
 	dst = ip6_route_output(&init_net, NULL, &fl);
 	if (!dst->error) {
 		struct rt6_info *rt;
 		rt = (struct rt6_info *)dst;
-		SCTP_DEBUG_PRINTK("rt6_dst:%p6 rt6_src:%p6\n",
+		SCTP_DEBUG_PRINTK("rt6_dst:%pI6 rt6_src:%pI6\n",
 			&rt->rt6i_dst.addr, &rt->rt6i_src.addr);
 		return dst;
 	}
@@ -309,7 +309,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
 	__u8 matchlen = 0;
 	__u8 bmatchlen;
 
-	SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p daddr:%p6 ",
+	SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p daddr:%pI6 ",
 			  __func__, asoc, dst, &daddr->v6.sin6_addr);
 
 	if (!asoc) {
@@ -318,7 +318,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
 				   &daddr->v6.sin6_addr,
 				   inet6_sk(&sk->inet.sk)->srcprefs,
 				   &saddr->v6.sin6_addr);
-		SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: %p6\n",
+		SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: %pI6\n",
 				  &saddr->v6.sin6_addr);
 		return;
 	}
@@ -347,10 +347,10 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk,
 
 	if (baddr) {
 		memcpy(saddr, baddr, sizeof(union sctp_addr));
-		SCTP_DEBUG_PRINTK("saddr: %p6\n", &saddr->v6.sin6_addr);
+		SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr);
 	} else {
 		printk(KERN_ERR "%s: asoc:%p Could not find a valid source "
-		       "address for the dest:%p6\n",
+		       "address for the dest:%pI6\n",
 		       __func__, asoc, &daddr->v6.sin6_addr);
 	}
 
@@ -720,7 +720,7 @@ static int sctp_v6_is_ce(const struct sk_buff *skb)
 /* Dump the v6 addr to the seq file. */
 static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr)
 {
-	seq_printf(seq, "%p6 ", &addr->v6.sin6_addr);
+	seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr);
 }
 
 static void sctp_v6_ecn_capable(struct sock *sk)
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 9f370964e73..d07b484b873 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1123,7 +1123,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
 		if (from_addr.sa.sa_family == AF_INET6) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
-				    "%s association %p could not find address %p6\n",
+				    "%s association %p could not find address %pI6\n",
 				    __func__,
 				    asoc,
 				    &from_addr.v6.sin6_addr);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 26f61fd11ea..8f067497c21 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -278,7 +278,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 		case AF_INET6: {
 			struct sockaddr_in6 *sin =
 					(struct sockaddr_in6 *)args->address;
-			snprintf(servername, sizeof(servername), "%p6",
+			snprintf(servername, sizeof(servername), "%pI6",
 				 &sin->sin6_addr);
 			break;
 		}
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 968ec1f66bc..4c8adadc214 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -305,7 +305,7 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
 		snprintf(buf, sizeof(buf), "::.%u.%u",
 				port >> 8, port & 0xff);
 	else
-		snprintf(buf, sizeof(buf), "%p6.%u.%u",
+		snprintf(buf, sizeof(buf), "%pI6.%u.%u",
 			 &address_to_register->sin6_addr,
 			 port >> 8, port & 0xff);
 	map->r_addr = buf;
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index eb640c1a1bc..16f714a247b 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -168,7 +168,7 @@ static void ip_map_request(struct cache_detail *cd,
 				ntohl(im->m_addr.s6_addr32[3]) >>  8 & 0xff,
 				ntohl(im->m_addr.s6_addr32[3]) >>  0 & 0xff);
 	} else {
-		snprintf(text_addr, 40, "%p6", &im->m_addr);
+		snprintf(text_addr, 40, "%pI6", &im->m_addr);
 	}
 	qword_add(bpp, blen, im->m_class);
 	qword_add(bpp, blen, text_addr);
@@ -286,7 +286,7 @@ static int ip_map_show(struct seq_file *m,
 			ntohl(addr.s6_addr32[3]) >>  0 & 0xff,
 			dom);
 	} else {
-		seq_printf(m, "%s %p6 %s\n", im->m_class, &addr, dom);
+		seq_printf(m, "%s %pI6 %s\n", im->m_class, &addr, dom);
 	}
 	return 0;
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 3c9aff58457..f9ce3c9949d 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -341,7 +341,7 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt,
 
 	buf = kzalloc(40, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 40, "%p6",&addr->sin6_addr);
+		snprintf(buf, 40, "%pI6",&addr->sin6_addr);
 	}
 	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
 
@@ -356,7 +356,7 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt,
 
 	buf = kzalloc(64, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 64, "addr=%p6 port=%u proto=%s",
+		snprintf(buf, 64, "addr=%pI6 port=%u proto=%s",
 				&addr->sin6_addr,
 				ntohs(addr->sin6_port),
 				protocol);
@@ -378,7 +378,7 @@ static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt,
 
 	buf = kzalloc(50, GFP_KERNEL);
 	if (buf) {
-		snprintf(buf, 50, "%p6.%u.%u",
+		snprintf(buf, 50, "%pI6.%u.%u",
 			 &addr->sin6_addr,
 			 ntohs(addr->sin6_port) >> 8,
 			 ntohs(addr->sin6_port) & 0xff);
@@ -1407,7 +1407,7 @@ static int xs_bind6(struct sock_xprt *transport, struct socket *sock)
 		if (port > last)
 			nloop++;
 	} while (err == -EADDRINUSE && nloop != 2);
-	dprintk("RPC:       xs_bind6 %p6:%u: %s (%d)\n",
+	dprintk("RPC:       xs_bind6 %pI6:%u: %s (%d)\n",
 		&myaddr.sin6_addr, port, err ? "failed" : "ok", err);
 	return err;
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f052b069f98..80b13eea30e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2467,11 +2467,11 @@ static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp,
 					 sel->prefixlen_d);
 		break;
 	case AF_INET6:
-		audit_log_format(audit_buf, " src=%p6", sel->saddr.a6);
+		audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6);
 		if (sel->prefixlen_s != 128)
 			audit_log_format(audit_buf, " src_prefixlen=%d",
 					 sel->prefixlen_s);
-		audit_log_format(audit_buf, " dst=%p6", sel->daddr.a6);
+		audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6);
 		if (sel->prefixlen_d != 128)
 			audit_log_format(audit_buf, " dst_prefixlen=%d",
 					 sel->prefixlen_d);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 7944861fb9b..304eca4ac97 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2115,7 +2115,7 @@ static void xfrm_audit_helper_sainfo(struct xfrm_state *x,
 				 NIPQUAD(x->id.daddr.a4));
 		break;
 	case AF_INET6:
-		audit_log_format(audit_buf, " src=%p6 dst=%p6",
+		audit_log_format(audit_buf, " src=%pI6 dst=%pI6",
 				 x->props.saddr.a6, x->id.daddr.a6);
 		break;
 	}
@@ -2140,7 +2140,7 @@ static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family,
 	case AF_INET6:
 		iph6 = ipv6_hdr(skb);
 		audit_log_format(audit_buf,
-				 " src=%p6 dst=%p6 flowlbl=0x%x%02x%02x",
+				 " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x",
 				 &iph6->saddr,&iph6->daddr,
 				 iph6->flow_lbl[0] & 0x0f,
 				 iph6->flow_lbl[1],
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index c91008f438a..ed6af12cdf4 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -495,7 +495,7 @@ static inline void avc_print_ipv6_addr(struct audit_buffer *ab,
 				       char *name1, char *name2)
 {
 	if (!ipv6_addr_any(addr))
-		audit_log_format(ab, " %s=%p6", name1, addr);
+		audit_log_format(ab, " %s=%pI6", name1, addr);
 	if (port)
 		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
-- 
cgit v1.2.3-70-g09d2


From 2cb1599f9b2ecdd7a9e59feeee647eb258966839 Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Thu, 30 Oct 2008 17:32:23 +1100
Subject: Inode: Allow external initialisers

To allow XFS to combine the XFS and linux inodes into a single
structure, we need to drive inode lookup from the XFS inode cache,
not the generic inode cache. This means that we need initialise a
struct inode from a context outside alloc_inode() as it is no longer
used by XFS.

Factor and export the struct inode initialisation code from
alloc_inode() to inode_init_always() as a counterpart to
inode_init_once().  i.e. we have to call this init function for each
inode instantiation (always), as opposed inode_init_once() which is
only called on slab object instantiation (once).

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/inode.c         | 140 +++++++++++++++++++++++++++++------------------------
 include/linux/fs.h |   1 +
 2 files changed, 79 insertions(+), 62 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba139..e7ee99907d6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -108,84 +108,100 @@ static void wake_up_inode(struct inode *inode)
 	wake_up_bit(&inode->i_state, __I_LOCK);
 }
 
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * inode_init_always - perform inode structure intialisation
+ * @sb		- superblock inode belongs to.
+ * @inode	- inode to initialise
+ *
+ * These are initializations that need to be done on every inode
+ * allocation as the fields are not initialised by slab allocation.
+ */
+struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 {
 	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
-	struct inode *inode;
-
-	if (sb->s_op->alloc_inode)
-		inode = sb->s_op->alloc_inode(sb);
-	else
-		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
-	if (inode) {
-		struct address_space * const mapping = &inode->i_data;
-
-		inode->i_sb = sb;
-		inode->i_blkbits = sb->s_blocksize_bits;
-		inode->i_flags = 0;
-		atomic_set(&inode->i_count, 1);
-		inode->i_op = &empty_iops;
-		inode->i_fop = &empty_fops;
-		inode->i_nlink = 1;
-		atomic_set(&inode->i_writecount, 0);
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		inode->i_bytes = 0;
-		inode->i_generation = 0;
+	struct address_space * const mapping = &inode->i_data;
+
+	inode->i_sb = sb;
+	inode->i_blkbits = sb->s_blocksize_bits;
+	inode->i_flags = 0;
+	atomic_set(&inode->i_count, 1);
+	inode->i_op = &empty_iops;
+	inode->i_fop = &empty_fops;
+	inode->i_nlink = 1;
+	atomic_set(&inode->i_writecount, 0);
+	inode->i_size = 0;
+	inode->i_blocks = 0;
+	inode->i_bytes = 0;
+	inode->i_generation = 0;
 #ifdef CONFIG_QUOTA
-		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+	memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
 #endif
-		inode->i_pipe = NULL;
-		inode->i_bdev = NULL;
-		inode->i_cdev = NULL;
-		inode->i_rdev = 0;
-		inode->dirtied_when = 0;
-		if (security_inode_alloc(inode)) {
-			if (inode->i_sb->s_op->destroy_inode)
-				inode->i_sb->s_op->destroy_inode(inode);
-			else
-				kmem_cache_free(inode_cachep, (inode));
-			return NULL;
-		}
+	inode->i_pipe = NULL;
+	inode->i_bdev = NULL;
+	inode->i_cdev = NULL;
+	inode->i_rdev = 0;
+	inode->dirtied_when = 0;
+	if (security_inode_alloc(inode)) {
+		if (inode->i_sb->s_op->destroy_inode)
+			inode->i_sb->s_op->destroy_inode(inode);
+		else
+			kmem_cache_free(inode_cachep, (inode));
+		return NULL;
+	}
 
-		spin_lock_init(&inode->i_lock);
-		lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+	spin_lock_init(&inode->i_lock);
+	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
-		mutex_init(&inode->i_mutex);
-		lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+	mutex_init(&inode->i_mutex);
+	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
 
-		init_rwsem(&inode->i_alloc_sem);
-		lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+	init_rwsem(&inode->i_alloc_sem);
+	lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
 
-		mapping->a_ops = &empty_aops;
- 		mapping->host = inode;
-		mapping->flags = 0;
-		mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
-		mapping->assoc_mapping = NULL;
-		mapping->backing_dev_info = &default_backing_dev_info;
-		mapping->writeback_index = 0;
+	mapping->a_ops = &empty_aops;
+	mapping->host = inode;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = &default_backing_dev_info;
+	mapping->writeback_index = 0;
 
-		/*
-		 * If the block_device provides a backing_dev_info for client
-		 * inodes then use that.  Otherwise the inode share the bdev's
-		 * backing_dev_info.
-		 */
-		if (sb->s_bdev) {
-			struct backing_dev_info *bdi;
+	/*
+	 * If the block_device provides a backing_dev_info for client
+	 * inodes then use that.  Otherwise the inode share the bdev's
+	 * backing_dev_info.
+	 */
+	if (sb->s_bdev) {
+		struct backing_dev_info *bdi;
 
-			bdi = sb->s_bdev->bd_inode_backing_dev_info;
-			if (!bdi)
-				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-			mapping->backing_dev_info = bdi;
-		}
-		inode->i_private = NULL;
-		inode->i_mapping = mapping;
+		bdi = sb->s_bdev->bd_inode_backing_dev_info;
+		if (!bdi)
+			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+		mapping->backing_dev_info = bdi;
 	}
+	inode->i_private = NULL;
+	inode->i_mapping = mapping;
+
 	return inode;
 }
+EXPORT_SYMBOL(inode_init_always);
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+	struct inode *inode;
+
+	if (sb->s_op->alloc_inode)
+		inode = sb->s_op->alloc_inode(sb);
+	else
+		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+
+	if (inode)
+		return inode_init_always(sb, inode);
+	return NULL;
+}
 
 void destroy_inode(struct inode *inode) 
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b248d61430..04abead4b02 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1881,6 +1881,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
 
 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
+extern struct inode * inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
-- 
cgit v1.2.3-70-g09d2


From 8290c35f87304a6b73d4fd17b03580b4f7425de8 Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Thu, 30 Oct 2008 17:35:24 +1100
Subject: Inode: Allow external list initialisation

To allow XFS to combine the XFS and linux inodes into a single
structure, we need to drive inode lookup from the XFS inode cache,
not the generic inode cache. This means that we need initialise a
struct inode from a context outside alloc_inode() as it is no longer
used by XFS.

After inode allocation and initialisation, we need to add the inode
to the superblock list, the in-use list, hash it and do some
accounting. This all needs to be done with the inode_lock held and
there are already several places in fs/inode.c that do this list
manipulation.  Factor out the common code, add a locking wrapper and
export the function so ti can be called from XFS.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/inode.c         | 67 +++++++++++++++++++++++++++++++++++++-----------------
 include/linux/fs.h |  1 +
 2 files changed, 47 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index e7ee99907d6..fbcf6c5e760 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -550,6 +550,49 @@ repeat:
 	return node ? inode : NULL;
 }
 
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+	unsigned long tmp;
+
+	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+			L1_CACHE_BYTES;
+	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+	return tmp & I_HASHMASK;
+}
+
+static inline void
+__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
+			struct inode *inode)
+{
+	inodes_stat.nr_inodes++;
+	list_add(&inode->i_list, &inode_in_use);
+	list_add(&inode->i_sb_list, &sb->s_inodes);
+	if (head)
+		hlist_add_head(&inode->i_hash, head);
+}
+
+/**
+ * inode_add_to_lists - add a new inode to relevant lists
+ * @sb		- superblock inode belongs to.
+ * @inode	- inode to mark in use
+ *
+ * When an inode is allocated it needs to be accounted for, added to the in use
+ * list, the owning superblock and the inode hash. This needs to be done under
+ * the inode_lock, so export a function to do this rather than the inode lock
+ * itself. We calculate the hash list to add to here so it is all internal
+ * which requires the caller to have already set up the inode number in the
+ * inode to add.
+ */
+void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+
+	spin_lock(&inode_lock);
+	__inode_add_to_lists(sb, head, inode);
+	spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_add_to_lists);
+
 /**
  *	new_inode 	- obtain an inode
  *	@sb: superblock
@@ -577,9 +620,7 @@ struct inode *new_inode(struct super_block *sb)
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
-		inodes_stat.nr_inodes++;
-		list_add(&inode->i_list, &inode_in_use);
-		list_add(&inode->i_sb_list, &sb->s_inodes);
+		__inode_add_to_lists(sb, NULL, inode);
 		inode->i_ino = ++last_ino;
 		inode->i_state = 0;
 		spin_unlock(&inode_lock);
@@ -638,10 +679,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
 			if (set(inode, data))
 				goto set_failed;
 
-			inodes_stat.nr_inodes++;
-			list_add(&inode->i_list, &inode_in_use);
-			list_add(&inode->i_sb_list, &sb->s_inodes);
-			hlist_add_head(&inode->i_hash, head);
+			__inode_add_to_lists(sb, head, inode);
 			inode->i_state = I_LOCK|I_NEW;
 			spin_unlock(&inode_lock);
 
@@ -687,10 +725,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
 			inode->i_ino = ino;
-			inodes_stat.nr_inodes++;
-			list_add(&inode->i_list, &inode_in_use);
-			list_add(&inode->i_sb_list, &sb->s_inodes);
-			hlist_add_head(&inode->i_hash, head);
+			__inode_add_to_lists(sb, head, inode);
 			inode->i_state = I_LOCK|I_NEW;
 			spin_unlock(&inode_lock);
 
@@ -714,16 +749,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
 	return inode;
 }
 
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-	unsigned long tmp;
-
-	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-			L1_CACHE_BYTES;
-	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-	return tmp & I_HASHMASK;
-}
-
 /**
  *	iunique - get a unique inode number
  *	@sb: superblock
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 04abead4b02..1deedf235d5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1883,6 +1883,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
 extern struct inode * inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void inode_add_to_lists(struct super_block *, struct inode *);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
-- 
cgit v1.2.3-70-g09d2


From d98d38f2014ab79f28c126ff175d034891f7aefc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 29 Oct 2008 14:24:09 -0700
Subject: mutex: improve header comment to be actually informative about the
 API

Impact: improve documentation

It's nice to say that mutex_trylock follows the spin_trylock convention.
It's a lot nicer if the comment also says which that is...  make it so.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mutex.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index bc6da10ceee..7a0e5c4f807 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 /*
  * NOTE: mutex_trylock() follows the spin_trylock() convention,
  *       not the down_trylock() convention!
+ *
+ * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
-- 
cgit v1.2.3-70-g09d2


From 17666f02b118099028522dfc3df00a235700e216 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 30 Oct 2008 16:08:32 -0400
Subject: ftrace: nmi safe code modification

Impact: fix crashes that can occur in NMI handlers, if their code is modified

Modifying code is something that needs special care. On SMP boxes,
if code that is being modified is also being executed on another CPU,
that CPU will have undefined results.

The dynamic ftrace uses kstop_machine to make the system act like a
uniprocessor system. But this does not address NMIs, that can still
run on other CPUs.

One approach to handle this is to make all code that are used by NMIs
not be traced. But NMIs can call notifiers that spread throughout the
kernel and this will be very hard to maintain, and the chance of missing
a function is very high.

The approach that this patch takes is to have the NMIs modify the code
if the modification is taking place. The way this works is that just
writing to code executing on another CPU is not harmful if what is
written is the same as what exists.

Two buffers are used: an IP buffer and a "code" buffer.

The steps that the patcher takes are:

 1) Put in the instruction pointer into the IP buffer
    and the new code into the "code" buffer.
 2) Set a flag that says we are modifying code
 3) Wait for any running NMIs to finish.
 4) Write the code
 5) clear the flag.
 6) Wait for any running NMIs to finish.

If an NMI is executed, it will also write the pending code.
Multiple writes are OK, because what is being written is the same.
Then the patcher must wait for all running NMIs to finish before
going to the next line that must be patched.

This is basically the RCU approach to code modification.

Thanks to Ingo Molnar for suggesting the idea, and to Arjan van de Ven
for his guidence on what is safe and what is not.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/include/asm/ftrace.h     |   5 ++
 arch/powerpc/include/asm/ftrace.h |   5 ++
 arch/sh/include/asm/ftrace.h      |   5 ++
 arch/sparc/include/asm/ftrace.h   |   5 ++
 arch/x86/include/asm/ftrace.h     |  15 ++++++
 arch/x86/kernel/ftrace.c          | 107 +++++++++++++++++++++++++++++++++++++-
 include/linux/hardirq.h           |  15 +++++-
 7 files changed, 154 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index 39c8bc1a006..d4c24a7a928 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -1,6 +1,11 @@
 #ifndef _ASM_ARM_FTRACE
 #define _ASM_ARM_FTRACE
 
+#ifndef __ASSEMBLY__
+#define ftrace_nmi_enter()	do { } while (0)
+#define ftrace_nmi_exit()	do { } while (0)
+#endif
+
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index b298f7a631e..7652755dc00 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -1,6 +1,11 @@
 #ifndef _ASM_POWERPC_FTRACE
 #define _ASM_POWERPC_FTRACE
 
+#ifndef __ASSEMBLY__
+#define ftrace_nmi_enter()	do { } while (0)
+#define ftrace_nmi_exit()	do { } while (0)
+#endif
+
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h
index 3aed362c946..cdf2cb0b9ff 100644
--- a/arch/sh/include/asm/ftrace.h
+++ b/arch/sh/include/asm/ftrace.h
@@ -1,6 +1,11 @@
 #ifndef __ASM_SH_FTRACE_H
 #define __ASM_SH_FTRACE_H
 
+#ifndef __ASSEMBLY__
+#define ftrace_nmi_enter()	do { } while (0)
+#define ftrace_nmi_exit()	do { } while (0)
+#endif
+
 #ifndef __ASSEMBLY__
 extern void mcount(void);
 #endif
diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h
index d27716cd38c..33a95feeb13 100644
--- a/arch/sparc/include/asm/ftrace.h
+++ b/arch/sparc/include/asm/ftrace.h
@@ -1,6 +1,11 @@
 #ifndef _ASM_SPARC64_FTRACE
 #define _ASM_SPARC64_FTRACE
 
+#ifndef __ASSEMBLY__
+#define ftrace_nmi_enter()	do { } while (0)
+#define ftrace_nmi_exit()	do { } while (0)
+#endif
+
 #ifdef CONFIG_MCOUNT
 #define MCOUNT_ADDR		((long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9e8bc29b8b1..f2ed6b704a7 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -17,6 +17,21 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	 */
 	return addr - 1;
 }
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_nmi_enter(void);
+extern void ftrace_nmi_exit(void);
+#else
+#define ftrace_nmi_enter()	do { } while (0)
+#define ftrace_nmi_exit()	do { } while (0)
+#endif
+#endif
+
+#else /* CONFIG_FUNCTION_TRACER */
+
+#ifndef __ASSEMBLY__
+#define ftrace_nmi_enter()	do { } while (0)
+#define ftrace_nmi_exit()	do { } while (0)
 #endif
 
 #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9b..fe5f859130b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -56,6 +56,111 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 	return calc.code;
 }
 
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUS from exectuing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put in the instruction pointer into the IP buffer
+ *    and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi;
+static int mod_code_status;
+static int mod_code_write;
+static void *mod_code_ip;
+static void *mod_code_newcode;
+
+static void ftrace_mod_code(void)
+{
+	/*
+	 * Yes, more than one CPU process can be writing to mod_code_status.
+	 *    (and the code itself)
+	 * But if one were to fail, then they all should, and if one were
+	 * to succeed, then they all should.
+	 */
+	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+					     MCOUNT_INSN_SIZE);
+
+}
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+	/* Must have in_nmi seen before reading write flag */
+	smp_mb();
+	if (mod_code_write)
+		ftrace_mod_code();
+}
+
+void ftrace_nmi_exit(void)
+{
+	/* Finish all executions before clearing in_nmi */
+	smp_wmb();
+	atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+	while (atomic_read(&in_nmi))
+		cpu_relax();
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+	mod_code_ip = (void *)ip;
+	mod_code_newcode = new_code;
+
+	/* The buffers need to be visible before we let NMIs write them */
+	smp_wmb();
+
+	mod_code_write = 1;
+
+	/* Make sure write bit is visible before we wait on NMIs */
+	smp_mb();
+
+	wait_for_nmi();
+
+	/* Make sure all running NMIs have finished before we write the code */
+	smp_mb();
+
+	ftrace_mod_code();
+
+	/* Make sure the write happens before clearing the bit */
+	smp_wmb();
+
+	mod_code_write = 0;
+
+	/* make sure NMIs see the cleared bit */
+	smp_mb();
+
+	wait_for_nmi();
+
+	return mod_code_status;
+}
+
+
 int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
@@ -81,7 +186,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		return -EINVAL;
 
 	/* replace the text with the new text */
-	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+	if (do_ftrace_mod_code(ip, new_code))
 		return -EPERM;
 
 	sync_core();
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006cc94a..0087cb43bec 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -5,6 +5,7 @@
 #include <linux/smp_lock.h>
 #include <linux/lockdep.h>
 #include <asm/hardirq.h>
+#include <asm/ftrace.h>
 #include <asm/system.h>
 
 /*
@@ -161,7 +162,17 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()				\
+	do {					\
+		ftrace_nmi_enter();		\
+		lockdep_off();			\
+		__irq_enter();			\
+	} while (0)
+#define nmi_exit()				\
+	do {					\
+		__irq_exit();			\
+		lockdep_on();			\
+		ftrace_nmi_exit();		\
+	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */
-- 
cgit v1.2.3-70-g09d2


From 3685f25de1b0447fff381c420de1e25bd57c9efb Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 31 Oct 2008 00:56:49 -0700
Subject: misc: replace NIPQUAD()

Using NIPQUAD() with NIPQUAD_FMT, %d.%d.%d.%d or %u.%u.%u.%u
can be replaced with %pI4

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sunrpc/svc_xprt.h            | 4 ++--
 include/net/ip_vs.h                        | 4 ++--
 include/net/netfilter/nf_conntrack_tuple.h | 6 +++---
 include/net/sctp/sctp.h                    | 4 ++--
 security/selinux/avc.c                     | 2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 51cb75ea42d..0127daca435 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -139,8 +139,8 @@ static inline char *__svc_print_addr(struct sockaddr *addr,
 {
 	switch (addr->sa_family) {
 	case AF_INET:
-		snprintf(buf, len, "%u.%u.%u.%u, port=%u",
-			NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
+		snprintf(buf, len, "%pI4, port=%u",
+			&((struct sockaddr_in *)addr)->sin_addr,
 			ntohs(((struct sockaddr_in *) addr)->sin_port));
 		break;
 
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index af48cada561..fc63353779f 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -91,8 +91,8 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
 			       &addr->in6) + 1;
 	else
 #endif
-		len = snprintf(&buf[*idx], buf_len - *idx, NIPQUAD_FMT,
-			       NIPQUAD(addr->ip)) + 1;
+		len = snprintf(&buf[*idx], buf_len - *idx, "%pI4",
+			       &addr->ip) + 1;
 
 	*idx += len;
 	BUG_ON(*idx > buf_len + 1);
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index 42f1fc96f3e..f2f6aa73dc1 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -112,10 +112,10 @@ struct nf_conntrack_tuple_mask
 static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t)
 {
 #ifdef DEBUG
-	printk("tuple %p: %u " NIPQUAD_FMT ":%hu -> " NIPQUAD_FMT ":%hu\n",
+	printk("tuple %p: %u %pI4:%hu -> %pI4:%hu\n",
 	       t, t->dst.protonum,
-	       NIPQUAD(t->src.u3.ip), ntohs(t->src.u.all),
-	       NIPQUAD(t->dst.u3.ip), ntohs(t->dst.u.all));
+	       &t->src.u3.ip, ntohs(t->src.u.all),
+	       &t->dst.u3.ip, ntohs(t->dst.u.all));
 #endif
 }
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index e71b0f7ce88..23797506f59 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -291,9 +291,9 @@ extern int sctp_debug_flag;
 			       otherparms); \
 		} else { \
 			printk(KERN_DEBUG \
-			       lead NIPQUAD_FMT trail, \
+			       lead "%pI4" trail, \
 			       leadparm, \
-			       NIPQUAD(saddr->v4.sin_addr.s_addr), \
+			       &saddr->v4.sin_addr.s_addr, \
 			       otherparms); \
 		} \
 	}
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index ed6af12cdf4..d43bd6baeea 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -504,7 +504,7 @@ static inline void avc_print_ipv4_addr(struct audit_buffer *ab, __be32 addr,
 				       __be16 port, char *name1, char *name2)
 {
 	if (addr)
-		audit_log_format(ab, " %s=" NIPQUAD_FMT, name1, NIPQUAD(addr));
+		audit_log_format(ab, " %s=%pI4", name1, &addr);
 	if (port)
 		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
-- 
cgit v1.2.3-70-g09d2


From 92be3d6bdf2cb34972ab50e12ad4da1076e690da Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 31 Oct 2008 09:48:08 +0800
Subject: kexec/i386: allocate page table pages dynamically

Impact: save .text size when kexec is built in but not loaded

This patch adds an architecture specific struct kimage_arch into
struct kimage. The pointers to page table pages used by kexec are
added to struct kimage_arch. The page tables pages are dynamically
allocated in machine_kexec_prepare instead of statically from BSS
segment. This will save up to 20k memory when kexec image is not
loaded.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/kexec.h       | 14 ++++++++
 arch/x86/kernel/machine_kexec_32.c | 67 ++++++++++++++++++++++++++------------
 include/linux/kexec.h              |  4 +++
 3 files changed, 64 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index a1f22771a15..df9c41a9c6a 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -170,6 +170,20 @@ relocate_kernel(unsigned long indirection_page,
 		unsigned long start_address) ATTRIB_NORET;
 #endif
 
+#ifdef CONFIG_X86_32
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+	pgd_t *pgd;
+#ifdef CONFIG_X86_PAE
+	pmd_t *pmd0;
+	pmd_t *pmd1;
+#endif
+	pte_t *pte0;
+	pte_t *pte1;
+};
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 7a385746509..1100312847a 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -13,6 +13,7 @@
 #include <linux/numa.h>
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
+#include <linux/gfp.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -25,15 +26,6 @@
 #include <asm/system.h>
 #include <asm/cacheflush.h>
 
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-static u32 kexec_pgd[1024] PAGE_ALIGNED;
-#ifdef CONFIG_X86_PAE
-static u32 kexec_pmd0[1024] PAGE_ALIGNED;
-static u32 kexec_pmd1[1024] PAGE_ALIGNED;
-#endif
-static u32 kexec_pte0[1024] PAGE_ALIGNED;
-static u32 kexec_pte1[1024] PAGE_ALIGNED;
-
 static void set_idt(void *newidt, __u16 limit)
 {
 	struct desc_ptr curidt;
@@ -76,6 +68,37 @@ static void load_segments(void)
 #undef __STR
 }
 
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+	free_page((unsigned long)image->arch.pgd);
+#ifdef CONFIG_X86_PAE
+	free_page((unsigned long)image->arch.pmd0);
+	free_page((unsigned long)image->arch.pmd1);
+#endif
+	free_page((unsigned long)image->arch.pte0);
+	free_page((unsigned long)image->arch.pte1);
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+	image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+#ifdef CONFIG_X86_PAE
+	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+	image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+	image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+	if (!image->arch.pgd ||
+#ifdef CONFIG_X86_PAE
+	    !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+	    !image->arch.pte0 || !image->arch.pte1) {
+		machine_kexec_free_page_tables(image);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -87,13 +110,14 @@ static void load_segments(void)
  * reboot code buffer to allow us to avoid allocations
  * later.
  *
- * Make control page executable.
+ * - Make control page executable.
+ * - Allocate page tables
  */
 int machine_kexec_prepare(struct kimage *image)
 {
 	if (nx_enabled)
 		set_pages_x(image->control_code_page, 1);
-	return 0;
+	return machine_kexec_alloc_page_tables(image);
 }
 
 /*
@@ -104,6 +128,7 @@ void machine_kexec_cleanup(struct kimage *image)
 {
 	if (nx_enabled)
 		set_pages_nx(image->control_code_page, 1);
+	machine_kexec_free_page_tables(image);
 }
 
 /*
@@ -150,18 +175,18 @@ void machine_kexec(struct kimage *image)
 	relocate_kernel_ptr = control_page;
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
 	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
-	page_list[PA_PGD] = __pa(kexec_pgd);
-	page_list[VA_PGD] = (unsigned long)kexec_pgd;
+	page_list[PA_PGD] = __pa(image->arch.pgd);
+	page_list[VA_PGD] = (unsigned long)image->arch.pgd;
 #ifdef CONFIG_X86_PAE
-	page_list[PA_PMD_0] = __pa(kexec_pmd0);
-	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-	page_list[PA_PMD_1] = __pa(kexec_pmd1);
-	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+	page_list[PA_PMD_0] = __pa(image->arch.pmd0);
+	page_list[VA_PMD_0] = (unsigned long)image->arch.pmd0;
+	page_list[PA_PMD_1] = __pa(image->arch.pmd1);
+	page_list[VA_PMD_1] = (unsigned long)image->arch.pmd1;
 #endif
-	page_list[PA_PTE_0] = __pa(kexec_pte0);
-	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-	page_list[PA_PTE_1] = __pa(kexec_pte1);
-	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+	page_list[PA_PTE_0] = __pa(image->arch.pte0);
+	page_list[VA_PTE_0] = (unsigned long)image->arch.pte0;
+	page_list[PA_PTE_1] = __pa(image->arch.pte1);
+	page_list[VA_PTE_1] = (unsigned long)image->arch.pte1;
 
 	if (image->type == KEXEC_TYPE_DEFAULT)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 17f76fc0517..adc34f2c6ef 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -100,6 +100,10 @@ struct kimage {
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
 	unsigned int preserve_context : 1;
+
+#ifdef ARCH_HAS_KIMAGE_ARCH
+	struct kimage_arch arch;
+#endif
 };
 
 
-- 
cgit v1.2.3-70-g09d2


From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 31 Oct 2008 00:03:22 -0400
Subject: ftrace: nmi safe code clean ups

Impact: cleanup

This patch cleans up the NMI safe code for dynamic ftrace as suggested
by Andrew Morton.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/include/asm/ftrace.h     |  4 ++--
 arch/powerpc/include/asm/ftrace.h |  4 ++--
 arch/sh/include/asm/ftrace.h      |  4 ++--
 arch/sparc/include/asm/ftrace.h   |  4 ++--
 arch/x86/include/asm/ftrace.h     | 10 +++++-----
 arch/x86/kernel/ftrace.c          | 16 ++++++++--------
 include/linux/ftrace.h            |  3 +++
 kernel/trace/trace.c              |  9 ++++-----
 8 files changed, 28 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index d4c24a7a928..3f3a1d1508e 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -2,8 +2,8 @@
 #define _ASM_ARM_FTRACE
 
 #ifndef __ASSEMBLY__
-#define ftrace_nmi_enter()	do { } while (0)
-#define ftrace_nmi_exit()	do { } while (0)
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 7652755dc00..1cd72700fbc 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -2,8 +2,8 @@
 #define _ASM_POWERPC_FTRACE
 
 #ifndef __ASSEMBLY__
-#define ftrace_nmi_enter()	do { } while (0)
-#define ftrace_nmi_exit()	do { } while (0)
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h
index cdf2cb0b9ff..31ada0370cb 100644
--- a/arch/sh/include/asm/ftrace.h
+++ b/arch/sh/include/asm/ftrace.h
@@ -2,8 +2,8 @@
 #define __ASM_SH_FTRACE_H
 
 #ifndef __ASSEMBLY__
-#define ftrace_nmi_enter()	do { } while (0)
-#define ftrace_nmi_exit()	do { } while (0)
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h
index 33a95feeb13..62055ac0496 100644
--- a/arch/sparc/include/asm/ftrace.h
+++ b/arch/sparc/include/asm/ftrace.h
@@ -2,8 +2,8 @@
 #define _ASM_SPARC64_FTRACE
 
 #ifndef __ASSEMBLY__
-#define ftrace_nmi_enter()	do { } while (0)
-#define ftrace_nmi_exit()	do { } while (0)
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif
 
 #ifdef CONFIG_MCOUNT
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index f2ed6b704a7..a23468194b8 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -22,16 +22,16 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
-#define ftrace_nmi_enter()	do { } while (0)
-#define ftrace_nmi_exit()	do { } while (0)
-#endif
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif
+#endif /* __ASSEMBLY__ */
 
 #else /* CONFIG_FUNCTION_TRACER */
 
 #ifndef __ASSEMBLY__
-#define ftrace_nmi_enter()	do { } while (0)
-#define ftrace_nmi_exit()	do { } while (0)
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif
 
 #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 6685b0fc1b4..69149337f2f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -67,7 +67,7 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  *
  * Two buffers are added: An IP buffer and a "code" buffer.
  *
- * 1) Put in the instruction pointer into the IP buffer
+ * 1) Put the instruction pointer into the IP buffer
  *    and the new code into the "code" buffer.
  * 2) Set a flag that says we are modifying code
  * 3) Wait for any running NMIs to finish.
@@ -85,14 +85,14 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
  * are the same as what exists.
  */
 
-static atomic_t in_nmi;
-static int mod_code_status;
-static int mod_code_write;
-static void *mod_code_ip;
-static void *mod_code_newcode;
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status;		/* holds return value of text write */
+static int mod_code_write;		/* set when NMI should do the write */
+static void *mod_code_ip;		/* holds the IP to write to */
+static void *mod_code_newcode;		/* holds the text to write to the IP */
 
-static int nmi_wait_count;
-static atomic_t nmi_update_count;
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
 
 int ftrace_arch_read_dyn_info(char *buf, int size)
 {
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 703eb53cfa2..22240dfe912 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -74,6 +74,9 @@ extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
 
+/* May be defined in arch */
+extern int ftrace_arch_read_dyn_info(char *buf, int size);
+
 /**
  * ftrace_modify_code - modify code segment
  * @ip: the address of the code segment
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bc36febc077..7f86067d760 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2815,10 +2815,6 @@ static struct file_operations tracing_mark_fops = {
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-#define DYN_INFO_BUF_SIZE 1023
-static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1];
-static DEFINE_MUTEX(dyn_info_mutex);
-
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
 {
 	return 0;
@@ -2828,14 +2824,17 @@ static ssize_t
 tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
+	static char ftrace_dyn_info_buffer[1024];
+	static DEFINE_MUTEX(dyn_info_mutex);
 	unsigned long *p = filp->private_data;
 	char *buf = ftrace_dyn_info_buffer;
+	int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
 	int r;
 
 	mutex_lock(&dyn_info_mutex);
 	r = sprintf(buf, "%ld ", *p);
 
-	r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r);
+	r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
 	buf[r++] = '\n';
 
 	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-- 
cgit v1.2.3-70-g09d2


From ae9eba0e2744f1aa15cdc97cd39277a84723ae23 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 30 Oct 2008 20:06:16 +0100
Subject: uwb: struct device - replace bus_id with dev_name(), dev_set_name()

Cc: David Vrabel <david.vrabel@csr.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-Off-By: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/umc-dev.c     | 3 +--
 drivers/uwb/whci.c        | 2 +-
 include/linux/uwb/debug.h | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/umc-dev.c b/drivers/uwb/umc-dev.c
index aa44e1c1a10..53207e14cd8 100644
--- a/drivers/uwb/umc-dev.c
+++ b/drivers/uwb/umc-dev.c
@@ -31,8 +31,7 @@ struct umc_dev *umc_device_create(struct device *parent, int n)
 
 	umc = kzalloc(sizeof(struct umc_dev), GFP_KERNEL);
 	if (umc) {
-		snprintf(umc->dev.bus_id, sizeof(umc->dev.bus_id), "%s-%d",
-			 parent->bus_id, n);
+		dev_set_name(&umc->dev, "%s-%d", dev_name(parent), n);
 		umc->dev.parent  = parent;
 		umc->dev.bus     = &umc_bus_type;
 		umc->dev.release = umc_device_release;
diff --git a/drivers/uwb/whci.c b/drivers/uwb/whci.c
index 3df2388f908..e626467f95e 100644
--- a/drivers/uwb/whci.c
+++ b/drivers/uwb/whci.c
@@ -111,7 +111,7 @@ static int whci_add_cap(struct whci_card *card, int n)
 		+ UWBCAPDATA_TO_OFFSET(capdata);
 	umc->resource.end    = umc->resource.start
 		+ (n == 0 ? 0x20 : UWBCAPDATA_TO_SIZE(capdata)) - 1;
-	umc->resource.name   = umc->dev.bus_id;
+	umc->resource.name   = dev_name(&umc->dev);
 	umc->resource.flags  = card->pci->resource[bar].flags;
 	umc->resource.parent = &card->pci->resource[bar];
 	umc->irq             = card->pci->irq;
diff --git a/include/linux/uwb/debug.h b/include/linux/uwb/debug.h
index a86a73fe303..67a24052714 100644
--- a/include/linux/uwb/debug.h
+++ b/include/linux/uwb/debug.h
@@ -60,7 +60,7 @@ do {									\
 				snprintf(__head, sizeof(__head),	\
 					 "%s %s: ",			\
 					 dev_driver_string(__dev),	\
-					 __dev->bus_id);		\
+					 dev_name(__dev));		\
 		}							\
 		printk(KERN_ERR "%s%s" _tag ": " f, __head,		\
 			__func__, ## a);				\
-- 
cgit v1.2.3-70-g09d2


From d9fe60dea7779d412b34679f1177c5ca1940ea8d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 9 Oct 2008 12:13:49 +0200
Subject: 802.11: clean up/fix HT support

This patch cleans up a number of things:
 * the unusable definition of the HT capabilities/HT information
   information elements
 * variable names that are hard to understand
 * mac80211: move ieee80211_handle_ht to ht.c and remove the unused
             enable_ht parameter
 * mac80211: fix bug with MCS rate 32 in ieee80211_handle_ht
 * mac80211: fix bug with casting the result of ieee80211_bss_get_ie
             to an information element _contents_ rather than the
             whole element, add size checking (another out-of-bounds
             access bug fixed!)
 * mac80211: remove some unused return values in favour of BUG_ON
             checking
 * a few minor other things

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath9k/main.c         |  57 ++++++-----
 drivers/net/wireless/ath9k/rc.c           |  10 +-
 drivers/net/wireless/ath9k/rc.h           |   1 -
 drivers/net/wireless/ath9k/recv.c         |   2 +-
 drivers/net/wireless/ath9k/xmit.c         |   2 +-
 drivers/net/wireless/iwlwifi/iwl-agn-rs.c |  18 ++--
 drivers/net/wireless/iwlwifi/iwl-agn.c    |  20 ++--
 drivers/net/wireless/iwlwifi/iwl-core.c   |  71 +++++++-------
 drivers/net/wireless/iwlwifi/iwl-core.h   |   2 +-
 drivers/net/wireless/iwlwifi/iwl-dev.h    |   4 +-
 drivers/net/wireless/iwlwifi/iwl-scan.c   |  12 +--
 drivers/net/wireless/iwlwifi/iwl-sta.c    |   6 +-
 drivers/net/wireless/mac80211_hwsim.c     |  19 ++--
 include/linux/ieee80211.h                 | 133 ++++++++++++++++++--------
 include/net/mac80211.h                    |  12 +--
 include/net/wireless.h                    |  15 +--
 net/mac80211/cfg.c                        |   7 +-
 net/mac80211/ht.c                         | 151 +++++++++++++++++++++++++-----
 net/mac80211/ieee80211_i.h                |  16 ++--
 net/mac80211/main.c                       |  94 -------------------
 net/mac80211/mlme.c                       |  45 ++++-----
 net/mac80211/util.c                       |   4 +-
 net/mac80211/wext.c                       |   4 +-
 23 files changed, 386 insertions(+), 319 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath9k/main.c b/drivers/net/wireless/ath9k/main.c
index 186d75acb32..5e087c92a6d 100644
--- a/drivers/net/wireless/ath9k/main.c
+++ b/drivers/net/wireless/ath9k/main.c
@@ -61,24 +61,24 @@ static u32 ath_get_extchanmode(struct ath_softc *sc,
 
 	switch (chan->band) {
 	case IEEE80211_BAND_2GHZ:
-		if ((ext_chan_offset == IEEE80211_HT_IE_CHA_SEC_NONE) &&
+		if ((ext_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_NONE) &&
 		    (tx_chan_width == ATH9K_HT_MACMODE_20))
 			chanmode = CHANNEL_G_HT20;
-		if ((ext_chan_offset == IEEE80211_HT_IE_CHA_SEC_ABOVE) &&
+		if ((ext_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_ABOVE) &&
 		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
 			chanmode = CHANNEL_G_HT40PLUS;
-		if ((ext_chan_offset == IEEE80211_HT_IE_CHA_SEC_BELOW) &&
+		if ((ext_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_BELOW) &&
 		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
 			chanmode = CHANNEL_G_HT40MINUS;
 		break;
 	case IEEE80211_BAND_5GHZ:
-		if ((ext_chan_offset == IEEE80211_HT_IE_CHA_SEC_NONE) &&
+		if ((ext_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_NONE) &&
 		    (tx_chan_width == ATH9K_HT_MACMODE_20))
 			chanmode = CHANNEL_A_HT20;
-		if ((ext_chan_offset == IEEE80211_HT_IE_CHA_SEC_ABOVE) &&
+		if ((ext_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_ABOVE) &&
 		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
 			chanmode = CHANNEL_A_HT40PLUS;
-		if ((ext_chan_offset == IEEE80211_HT_IE_CHA_SEC_BELOW) &&
+		if ((ext_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_BELOW) &&
 		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
 			chanmode = CHANNEL_A_HT40MINUS;
 		break;
@@ -215,24 +215,24 @@ static void ath_key_delete(struct ath_softc *sc, struct ieee80211_key_conf *key)
 	ath_key_reset(sc, key->keyidx, freeslot);
 }
 
-static void setup_ht_cap(struct ieee80211_ht_info *ht_info)
+static void setup_ht_cap(struct ieee80211_sta_ht_cap *ht_info)
 {
 #define	ATH9K_HT_CAP_MAXRXAMPDU_65536 0x3	/* 2 ^ 16 */
 #define	ATH9K_HT_CAP_MPDUDENSITY_8 0x6		/* 8 usec */
 
-	ht_info->ht_supported = 1;
-	ht_info->cap = (u16)IEEE80211_HT_CAP_SUP_WIDTH
-			|(u16)IEEE80211_HT_CAP_SM_PS
-			|(u16)IEEE80211_HT_CAP_SGI_40
-			|(u16)IEEE80211_HT_CAP_DSSSCCK40;
+	ht_info->ht_supported = true;
+	ht_info->cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+		       IEEE80211_HT_CAP_SM_PS |
+		       IEEE80211_HT_CAP_SGI_40 |
+		       IEEE80211_HT_CAP_DSSSCCK40;
 
 	ht_info->ampdu_factor = ATH9K_HT_CAP_MAXRXAMPDU_65536;
 	ht_info->ampdu_density = ATH9K_HT_CAP_MPDUDENSITY_8;
-	/* setup supported mcs set */
-	memset(ht_info->supp_mcs_set, 0, 16);
-	ht_info->supp_mcs_set[0] = 0xff;
-	ht_info->supp_mcs_set[1] = 0xff;
-	ht_info->supp_mcs_set[12] = IEEE80211_HT_CAP_MCS_TX_DEFINED;
+	/* set up supported mcs set */
+	memset(&ht_info->mcs, 0, sizeof(ht_info->mcs));
+	ht_info->mcs.rx_mask[0] = 0xff;
+	ht_info->mcs.rx_mask[1] = 0xff;
+	ht_info->mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED;
 }
 
 static int ath_rate2idx(struct ath_softc *sc, int rate)
@@ -328,31 +328,28 @@ static u8 parse_mpdudensity(u8 mpdudensity)
 static void ath9k_ht_conf(struct ath_softc *sc,
 			  struct ieee80211_bss_conf *bss_conf)
 {
-#define IEEE80211_HT_CAP_40MHZ_INTOLERANT BIT(14)
 	struct ath_ht_info *ht_info = &sc->sc_ht_info;
 
 	if (bss_conf->assoc_ht) {
 		ht_info->ext_chan_offset =
 			bss_conf->ht_bss_conf->bss_cap &
-				IEEE80211_HT_IE_CHA_SEC_OFFSET;
+				IEEE80211_HT_PARAM_CHA_SEC_OFFSET;
 
-		if (!(bss_conf->ht_conf->cap &
+		if (!(bss_conf->ht_cap->cap &
 			IEEE80211_HT_CAP_40MHZ_INTOLERANT) &&
 			    (bss_conf->ht_bss_conf->bss_cap &
-				IEEE80211_HT_IE_CHA_WIDTH))
+				IEEE80211_HT_PARAM_CHAN_WIDTH_ANY))
 			ht_info->tx_chan_width = ATH9K_HT_MACMODE_2040;
 		else
 			ht_info->tx_chan_width = ATH9K_HT_MACMODE_20;
 
 		ath9k_hw_set11nmac2040(sc->sc_ah, ht_info->tx_chan_width);
 		ht_info->maxampdu = 1 << (IEEE80211_HTCAP_MAXRXAMPDU_FACTOR +
-					bss_conf->ht_conf->ampdu_factor);
+					bss_conf->ht_cap->ampdu_factor);
 		ht_info->mpdudensity =
-			parse_mpdudensity(bss_conf->ht_conf->ampdu_density);
+			parse_mpdudensity(bss_conf->ht_cap->ampdu_density);
 
 	}
-
-#undef IEEE80211_HT_CAP_40MHZ_INTOLERANT
 }
 
 static void ath9k_bss_assoc_info(struct ath_softc *sc,
@@ -411,7 +408,7 @@ static void ath9k_bss_assoc_info(struct ath_softc *sc,
 			return;
 		}
 
-		if (hw->conf.ht_conf.ht_supported)
+		if (hw->conf.ht_cap.ht_supported)
 			sc->sc_ah->ah_channels[pos].chanmode =
 				ath_get_extchanmode(sc, curchan);
 		else
@@ -534,7 +531,7 @@ int _ath_rx_indicate(struct ath_softc *sc,
 
 	if (an) {
 		ath_rx_input(sc, an,
-			     hw->conf.ht_conf.ht_supported,
+			     hw->conf.ht_cap.ht_supported,
 			     skb, status, &st);
 	}
 	if (!an || (st != ATH_RX_CONSUMED))
@@ -943,7 +940,7 @@ static int ath_attach(u16 devid,
 
 	if (sc->sc_ah->ah_caps.hw_caps & ATH9K_HW_CAP_HT)
 		/* Setup HT capabilities for 2.4Ghz*/
-		setup_ht_cap(&sc->sbands[IEEE80211_BAND_2GHZ].ht_info);
+		setup_ht_cap(&sc->sbands[IEEE80211_BAND_2GHZ].ht_cap);
 
 	hw->wiphy->bands[IEEE80211_BAND_2GHZ] =
 		&sc->sbands[IEEE80211_BAND_2GHZ];
@@ -958,7 +955,7 @@ static int ath_attach(u16 devid,
 
 		if (sc->sc_ah->ah_caps.hw_caps & ATH9K_HW_CAP_HT)
 			/* Setup HT capabilities for 5Ghz*/
-			setup_ht_cap(&sc->sbands[IEEE80211_BAND_5GHZ].ht_info);
+			setup_ht_cap(&sc->sbands[IEEE80211_BAND_5GHZ].ht_cap);
 
 		hw->wiphy->bands[IEEE80211_BAND_5GHZ] =
 			&sc->sbands[IEEE80211_BAND_5GHZ];
@@ -1254,7 +1251,7 @@ static int ath9k_config(struct ieee80211_hw *hw,
 		(curchan->band == IEEE80211_BAND_2GHZ) ?
 		CHANNEL_G : CHANNEL_A;
 
-	if (sc->sc_curaid && hw->conf.ht_conf.ht_supported)
+	if (sc->sc_curaid && hw->conf.ht_cap.ht_supported)
 		sc->sc_ah->ah_channels[pos].chanmode =
 			ath_get_extchanmode(sc, curchan);
 
diff --git a/drivers/net/wireless/ath9k/rc.c b/drivers/net/wireless/ath9k/rc.c
index b1e535b8ec4..ee2dbce42b4 100644
--- a/drivers/net/wireless/ath9k/rc.c
+++ b/drivers/net/wireless/ath9k/rc.c
@@ -1838,7 +1838,7 @@ void ath_rc_node_update(struct ieee80211_hw *hw, struct ath_rate_node *rc_priv)
 	struct ath_softc *sc = hw->priv;
 	u32 capflag = 0;
 
-	if (hw->conf.ht_conf.ht_supported) {
+	if (hw->conf.ht_cap.ht_supported) {
 		capflag |= ATH_RC_HT_FLAG | ATH_RC_DS_FLAG;
 		if (sc->sc_ht_info.tx_chan_width == ATH9K_HT_MACMODE_2040)
 			capflag |= ATH_RC_CW40_FLAG;
@@ -1910,7 +1910,7 @@ static void ath_tx_aggr_resp(struct ath_softc *sc,
 	 */
 	si = container_of(sta, struct sta_info, sta);
 	buffersize = IEEE80211_MIN_AMPDU_BUF <<
-		sband->ht_info.ampdu_factor; /* FIXME */
+		sband->ht_cap.ampdu_factor; /* FIXME */
 	state = si->ampdu_mlme.tid_state_tx[tidno];
 
 	if (state & HT_ADDBA_RECEIVED_MSK) {
@@ -1979,7 +1979,7 @@ static void ath_get_rate(void *priv, struct ieee80211_supported_band *sband,
 
 	/* Check if aggregation has to be enabled for this tid */
 
-	if (hw->conf.ht_conf.ht_supported) {
+	if (hw->conf.ht_cap.ht_supported) {
 		if (ieee80211_is_data_qos(fc)) {
 			qc = ieee80211_get_qos_ctl(hdr);
 			tid = qc[0] & 0xf;
@@ -2027,8 +2027,8 @@ static void ath_rate_init(void *priv, struct ieee80211_supported_band *sband,
 
 	ath_setup_rates(sc, sband, sta, ath_rc_priv);
 	if (sc->hw->conf.flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
-		for (i = 0; i < MCS_SET_SIZE; i++) {
-			if (sc->hw->conf.ht_conf.supp_mcs_set[i/8] & (1<<(i%8)))
+		for (i = 0; i < 77; i++) {
+			if (sc->hw->conf.ht_cap.mcs.rx_mask[i/8] & (1<<(i%8)))
 				ath_rc_priv->neg_ht_rates.rs_rates[j++] = i;
 			if (j == ATH_RATE_MAX)
 				break;
diff --git a/drivers/net/wireless/ath9k/rc.h b/drivers/net/wireless/ath9k/rc.h
index b95b41508b9..6671097fad7 100644
--- a/drivers/net/wireless/ath9k/rc.h
+++ b/drivers/net/wireless/ath9k/rc.h
@@ -59,7 +59,6 @@ struct ath_softc;
 #define FALSE 0
 
 #define ATH_RATE_MAX	30
-#define MCS_SET_SIZE	128
 
 enum ieee80211_fixed_rate_mode {
 	IEEE80211_FIXED_RATE_NONE  = 0,
diff --git a/drivers/net/wireless/ath9k/recv.c b/drivers/net/wireless/ath9k/recv.c
index 4983402af55..010fcdfec3f 100644
--- a/drivers/net/wireless/ath9k/recv.c
+++ b/drivers/net/wireless/ath9k/recv.c
@@ -1119,7 +1119,7 @@ int ath_rx_aggr_start(struct ath_softc *sc,
 
 	sband = hw->wiphy->bands[hw->conf.channel->band];
 	buffersize = IEEE80211_MIN_AMPDU_BUF <<
-		sband->ht_info.ampdu_factor; /* FIXME */
+		sband->ht_cap.ampdu_factor; /* FIXME */
 
 	rxtid = &an->an_aggr.rx.tid[tid];
 
diff --git a/drivers/net/wireless/ath9k/xmit.c b/drivers/net/wireless/ath9k/xmit.c
index 13866043dec..3770fbe84fc 100644
--- a/drivers/net/wireless/ath9k/xmit.c
+++ b/drivers/net/wireless/ath9k/xmit.c
@@ -300,7 +300,7 @@ static int ath_tx_prepare(struct ath_softc *sc,
 	if (ieee80211_is_data(fc) && !txctl->use_minrate) {
 
 		/* Enable HT only for DATA frames and not for EAPOL */
-		txctl->ht = (hw->conf.ht_conf.ht_supported &&
+		txctl->ht = (hw->conf.ht_cap.ht_supported &&
 			    (tx_info->flags & IEEE80211_TX_CTL_AMPDU));
 
 		if (is_multicast_ether_addr(hdr->addr1)) {
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
index b497d40dc39..cd1bff59049 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
@@ -1134,10 +1134,10 @@ static int rs_switch_to_mimo2(struct iwl_priv *priv,
 	s8 is_green = lq_sta->is_green;
 
 	if (!(conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) ||
-	    !sta->ht_info.ht_supported)
+	    !sta->ht_cap.ht_supported)
 		return -1;
 
-	if (((sta->ht_info.cap & IEEE80211_HT_CAP_SM_PS) >> 2)
+	if (((sta->ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> 2)
 						== WLAN_HT_CAP_SM_PS_STATIC)
 		return -1;
 
@@ -1202,7 +1202,7 @@ static int rs_switch_to_siso(struct iwl_priv *priv,
 	s32 rate;
 
 	if (!(conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) ||
-	    !sta->ht_info.ht_supported)
+	    !sta->ht_cap.ht_supported)
 		return -1;
 
 	IWL_DEBUG_RATE("LQ: try to switch to SISO\n");
@@ -2238,19 +2238,19 @@ static void rs_rate_init(void *priv_r, struct ieee80211_supported_band *sband,
 	 * active_siso_rate mask includes 9 MBits (bit 5), and CCK (bits 0-3),
 	 * supp_rates[] does not; shift to convert format, force 9 MBits off.
 	 */
-	lq_sta->active_siso_rate = conf->ht_conf.supp_mcs_set[0] << 1;
-	lq_sta->active_siso_rate |= conf->ht_conf.supp_mcs_set[0] & 0x1;
+	lq_sta->active_siso_rate = conf->ht_cap.mcs.rx_mask[0] << 1;
+	lq_sta->active_siso_rate |= conf->ht_cap.mcs.rx_mask[0] & 0x1;
 	lq_sta->active_siso_rate &= ~((u16)0x2);
 	lq_sta->active_siso_rate <<= IWL_FIRST_OFDM_RATE;
 
 	/* Same here */
-	lq_sta->active_mimo2_rate = conf->ht_conf.supp_mcs_set[1] << 1;
-	lq_sta->active_mimo2_rate |= conf->ht_conf.supp_mcs_set[1] & 0x1;
+	lq_sta->active_mimo2_rate = conf->ht_cap.mcs.rx_mask[1] << 1;
+	lq_sta->active_mimo2_rate |= conf->ht_cap.mcs.rx_mask[1] & 0x1;
 	lq_sta->active_mimo2_rate &= ~((u16)0x2);
 	lq_sta->active_mimo2_rate <<= IWL_FIRST_OFDM_RATE;
 
-	lq_sta->active_mimo3_rate = conf->ht_conf.supp_mcs_set[2] << 1;
-	lq_sta->active_mimo3_rate |= conf->ht_conf.supp_mcs_set[2] & 0x1;
+	lq_sta->active_mimo3_rate = conf->ht_cap.mcs.rx_mask[2] << 1;
+	lq_sta->active_mimo3_rate |= conf->ht_cap.mcs.rx_mask[2] & 0x1;
 	lq_sta->active_mimo3_rate &= ~((u16)0x2);
 	lq_sta->active_mimo3_rate <<= IWL_FIRST_OFDM_RATE;
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index 2cac09405af..e6695e80fb5 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -552,7 +552,7 @@ static int iwl4965_send_beacon_cmd(struct iwl_priv *priv)
 static void iwl4965_ht_conf(struct iwl_priv *priv,
 			    struct ieee80211_bss_conf *bss_conf)
 {
-	struct ieee80211_ht_info *ht_conf = bss_conf->ht_conf;
+	struct ieee80211_sta_ht_cap *ht_conf = bss_conf->ht_cap;
 	struct ieee80211_ht_bss_info *ht_bss_conf = bss_conf->ht_bss_conf;
 	struct iwl_ht_info *iwl_conf = &priv->current_ht_config;
 
@@ -573,27 +573,27 @@ static void iwl4965_ht_conf(struct iwl_priv *priv,
 		!!(ht_conf->cap & IEEE80211_HT_CAP_MAX_AMSDU);
 
 	iwl_conf->supported_chan_width =
-		!!(ht_conf->cap & IEEE80211_HT_CAP_SUP_WIDTH);
+		!!(ht_conf->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40);
 	iwl_conf->extension_chan_offset =
-		ht_bss_conf->bss_cap & IEEE80211_HT_IE_CHA_SEC_OFFSET;
+		ht_bss_conf->bss_cap & IEEE80211_HT_PARAM_CHA_SEC_OFFSET;
 	/* If no above or below channel supplied disable FAT channel */
-	if (iwl_conf->extension_chan_offset != IEEE80211_HT_IE_CHA_SEC_ABOVE &&
-	    iwl_conf->extension_chan_offset != IEEE80211_HT_IE_CHA_SEC_BELOW) {
-		iwl_conf->extension_chan_offset = IEEE80211_HT_IE_CHA_SEC_NONE;
+	if (iwl_conf->extension_chan_offset != IEEE80211_HT_PARAM_CHA_SEC_ABOVE &&
+	    iwl_conf->extension_chan_offset != IEEE80211_HT_PARAM_CHA_SEC_BELOW) {
+		iwl_conf->extension_chan_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
 		iwl_conf->supported_chan_width = 0;
 	}
 
 	iwl_conf->sm_ps = (u8)((ht_conf->cap & IEEE80211_HT_CAP_SM_PS) >> 2);
 
-	memcpy(iwl_conf->supp_mcs_set, ht_conf->supp_mcs_set, 16);
+	memcpy(&iwl_conf->mcs, &ht_conf->mcs, 16);
 
 	iwl_conf->control_channel = ht_bss_conf->primary_channel;
 	iwl_conf->tx_chan_width =
-		!!(ht_bss_conf->bss_cap & IEEE80211_HT_IE_CHA_WIDTH);
+		!!(ht_bss_conf->bss_cap & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY);
 	iwl_conf->ht_protection =
-		ht_bss_conf->bss_op_mode & IEEE80211_HT_IE_HT_PROTECTION;
+		ht_bss_conf->bss_op_mode & IEEE80211_HT_OP_MODE_PROTECTION;
 	iwl_conf->non_GF_STA_present =
-		!!(ht_bss_conf->bss_op_mode & IEEE80211_HT_IE_NON_GF_STA_PRSNT);
+		!!(ht_bss_conf->bss_op_mode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT);
 
 	IWL_DEBUG_MAC80211("control channel %d\n", iwl_conf->control_channel);
 	IWL_DEBUG_MAC80211("leave\n");
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c
index 4c312c55f90..4678da447ff 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.c
+++ b/drivers/net/wireless/iwlwifi/iwl-core.c
@@ -382,10 +382,10 @@ void iwl_reset_qos(struct iwl_priv *priv)
 }
 EXPORT_SYMBOL(iwl_reset_qos);
 
-#define MAX_BIT_RATE_40_MHZ 0x96 /* 150 Mbps */
-#define MAX_BIT_RATE_20_MHZ 0x48 /* 72 Mbps */
+#define MAX_BIT_RATE_40_MHZ 150 /* Mbps */
+#define MAX_BIT_RATE_20_MHZ 72 /* Mbps */
 static void iwlcore_init_ht_hw_capab(const struct iwl_priv *priv,
-			      struct ieee80211_ht_info *ht_info,
+			      struct ieee80211_sta_ht_cap *ht_info,
 			      enum ieee80211_band band)
 {
 	u16 max_bit_rate = 0;
@@ -393,45 +393,46 @@ static void iwlcore_init_ht_hw_capab(const struct iwl_priv *priv,
 	u8 tx_chains_num = priv->hw_params.tx_chains_num;
 
 	ht_info->cap = 0;
-	memset(ht_info->supp_mcs_set, 0, 16);
+	memset(&ht_info->mcs, 0, sizeof(ht_info->mcs));
 
-	ht_info->ht_supported = 1;
+	ht_info->ht_supported = true;
 
-	ht_info->cap |= (u16)IEEE80211_HT_CAP_GRN_FLD;
-	ht_info->cap |= (u16)IEEE80211_HT_CAP_SGI_20;
-	ht_info->cap |= (u16)(IEEE80211_HT_CAP_SM_PS &
+	ht_info->cap |= IEEE80211_HT_CAP_GRN_FLD;
+	ht_info->cap |= IEEE80211_HT_CAP_SGI_20;
+	ht_info->cap |= (IEEE80211_HT_CAP_SM_PS &
 			     (WLAN_HT_CAP_SM_PS_DISABLED << 2));
 
 	max_bit_rate = MAX_BIT_RATE_20_MHZ;
 	if (priv->hw_params.fat_channel & BIT(band)) {
-		ht_info->cap |= (u16)IEEE80211_HT_CAP_SUP_WIDTH;
-		ht_info->cap |= (u16)IEEE80211_HT_CAP_SGI_40;
-		ht_info->supp_mcs_set[4] = 0x01;
+		ht_info->cap |= IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+		ht_info->cap |= IEEE80211_HT_CAP_SGI_40;
+		ht_info->mcs.rx_mask[4] = 0x01;
 		max_bit_rate = MAX_BIT_RATE_40_MHZ;
 	}
 
 	if (priv->cfg->mod_params->amsdu_size_8K)
-		ht_info->cap |= (u16)IEEE80211_HT_CAP_MAX_AMSDU;
+		ht_info->cap |= IEEE80211_HT_CAP_MAX_AMSDU;
 
 	ht_info->ampdu_factor = CFG_HT_RX_AMPDU_FACTOR_DEF;
 	ht_info->ampdu_density = CFG_HT_MPDU_DENSITY_DEF;
 
-	ht_info->supp_mcs_set[0] = 0xFF;
+	ht_info->mcs.rx_mask[0] = 0xFF;
 	if (rx_chains_num >= 2)
-		ht_info->supp_mcs_set[1] = 0xFF;
+		ht_info->mcs.rx_mask[1] = 0xFF;
 	if (rx_chains_num >= 3)
-		ht_info->supp_mcs_set[2] = 0xFF;
+		ht_info->mcs.rx_mask[2] = 0xFF;
 
 	/* Highest supported Rx data rate */
 	max_bit_rate *= rx_chains_num;
-	ht_info->supp_mcs_set[10] = (u8)(max_bit_rate & 0x00FF);
-	ht_info->supp_mcs_set[11] = (u8)((max_bit_rate & 0xFF00) >> 8);
+	WARN_ON(max_bit_rate & ~IEEE80211_HT_MCS_RX_HIGHEST_MASK);
+	ht_info->mcs.rx_highest = cpu_to_le16(max_bit_rate);
 
 	/* Tx MCS capabilities */
-	ht_info->supp_mcs_set[12] = IEEE80211_HT_CAP_MCS_TX_DEFINED;
+	ht_info->mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED;
 	if (tx_chains_num != rx_chains_num) {
-		ht_info->supp_mcs_set[12] |= IEEE80211_HT_CAP_MCS_TX_RX_DIFF;
-		ht_info->supp_mcs_set[12] |= ((tx_chains_num - 1) << 2);
+		ht_info->mcs.tx_params |= IEEE80211_HT_MCS_TX_RX_DIFF;
+		ht_info->mcs.tx_params |= ((tx_chains_num - 1) <<
+				IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT);
 	}
 }
 
@@ -495,7 +496,7 @@ static int iwlcore_init_geos(struct iwl_priv *priv)
 	sband->n_bitrates = IWL_RATE_COUNT - IWL_FIRST_OFDM_RATE;
 
 	if (priv->cfg->sku & IWL_SKU_N)
-		iwlcore_init_ht_hw_capab(priv, &sband->ht_info,
+		iwlcore_init_ht_hw_capab(priv, &sband->ht_cap,
 					 IEEE80211_BAND_5GHZ);
 
 	sband = &priv->bands[IEEE80211_BAND_2GHZ];
@@ -505,7 +506,7 @@ static int iwlcore_init_geos(struct iwl_priv *priv)
 	sband->n_bitrates = IWL_RATE_COUNT;
 
 	if (priv->cfg->sku & IWL_SKU_N)
-		iwlcore_init_ht_hw_capab(priv, &sband->ht_info,
+		iwlcore_init_ht_hw_capab(priv, &sband->ht_cap,
 					 IEEE80211_BAND_2GHZ);
 
 	priv->ieee_channels = channels;
@@ -595,8 +596,8 @@ static void iwlcore_free_geos(struct iwl_priv *priv)
 static bool is_single_rx_stream(struct iwl_priv *priv)
 {
 	return !priv->current_ht_config.is_ht ||
-	       ((priv->current_ht_config.supp_mcs_set[1] == 0) &&
-		(priv->current_ht_config.supp_mcs_set[2] == 0));
+	       ((priv->current_ht_config.mcs.rx_mask[1] == 0) &&
+		(priv->current_ht_config.mcs.rx_mask[2] == 0));
 }
 
 static u8 iwl_is_channel_extension(struct iwl_priv *priv,
@@ -609,10 +610,10 @@ static u8 iwl_is_channel_extension(struct iwl_priv *priv,
 	if (!is_channel_valid(ch_info))
 		return 0;
 
-	if (extension_chan_offset == IEEE80211_HT_IE_CHA_SEC_ABOVE)
+	if (extension_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_ABOVE)
 		return !(ch_info->fat_extension_channel &
 					IEEE80211_CHAN_NO_FAT_ABOVE);
-	else if (extension_chan_offset == IEEE80211_HT_IE_CHA_SEC_BELOW)
+	else if (extension_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_BELOW)
 		return !(ch_info->fat_extension_channel &
 					IEEE80211_CHAN_NO_FAT_BELOW);
 
@@ -620,18 +621,18 @@ static u8 iwl_is_channel_extension(struct iwl_priv *priv,
 }
 
 u8 iwl_is_fat_tx_allowed(struct iwl_priv *priv,
-			     struct ieee80211_ht_info *sta_ht_inf)
+			 struct ieee80211_sta_ht_cap *sta_ht_inf)
 {
 	struct iwl_ht_info *iwl_ht_conf = &priv->current_ht_config;
 
 	if ((!iwl_ht_conf->is_ht) ||
 	   (iwl_ht_conf->supported_chan_width != IWL_CHANNEL_WIDTH_40MHZ) ||
-	   (iwl_ht_conf->extension_chan_offset == IEEE80211_HT_IE_CHA_SEC_NONE))
+	   (iwl_ht_conf->extension_chan_offset == IEEE80211_HT_PARAM_CHA_SEC_NONE))
 		return 0;
 
 	if (sta_ht_inf) {
 		if ((!sta_ht_inf->ht_supported) ||
-		   (!(sta_ht_inf->cap & IEEE80211_HT_CAP_SUP_WIDTH)))
+		   (!(sta_ht_inf->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40)))
 			return 0;
 	}
 
@@ -671,13 +672,13 @@ void iwl_set_rxon_ht(struct iwl_priv *priv, struct iwl_ht_info *ht_info)
 
 	/* Note: control channel is opposite of extension channel */
 	switch (ht_info->extension_chan_offset) {
-	case IEEE80211_HT_IE_CHA_SEC_ABOVE:
+	case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
 		rxon->flags &= ~(RXON_FLG_CTRL_CHANNEL_LOC_HI_MSK);
 		break;
-	case IEEE80211_HT_IE_CHA_SEC_BELOW:
+	case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
 		rxon->flags |= RXON_FLG_CTRL_CHANNEL_LOC_HI_MSK;
 		break;
-	case IEEE80211_HT_IE_CHA_SEC_NONE:
+	case IEEE80211_HT_PARAM_CHA_SEC_NONE:
 	default:
 		rxon->flags &= ~RXON_FLG_CHANNEL_MODE_MIXED_MSK;
 		break;
@@ -693,9 +694,9 @@ void iwl_set_rxon_ht(struct iwl_priv *priv, struct iwl_ht_info *ht_info)
 			"rxon flags 0x%X operation mode :0x%X "
 			"extension channel offset 0x%x "
 			"control chan %d\n",
-			ht_info->supp_mcs_set[0],
-			ht_info->supp_mcs_set[1],
-			ht_info->supp_mcs_set[2],
+			ht_info->mcs.rx_mask[0],
+			ht_info->mcs.rx_mask[1],
+			ht_info->mcs.rx_mask[2],
 			le32_to_cpu(rxon->flags), ht_info->ht_protection,
 			ht_info->extension_chan_offset,
 			ht_info->control_channel);
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.h b/drivers/net/wireless/iwlwifi/iwl-core.h
index 288b6a800e0..1a3ad8b1ebd 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.h
+++ b/drivers/net/wireless/iwlwifi/iwl-core.h
@@ -190,7 +190,7 @@ void iwl_set_rxon_chain(struct iwl_priv *priv);
 int iwl_set_rxon_channel(struct iwl_priv *priv, struct ieee80211_channel *ch);
 void iwl_set_rxon_ht(struct iwl_priv *priv, struct iwl_ht_info *ht_info);
 u8 iwl_is_fat_tx_allowed(struct iwl_priv *priv,
-			 struct ieee80211_ht_info *sta_ht_inf);
+			 struct ieee80211_sta_ht_cap *sta_ht_inf);
 int iwl_hw_nic_init(struct iwl_priv *priv);
 int iwl_setup_mac(struct iwl_priv *priv);
 int iwl_set_hw_params(struct iwl_priv *priv);
diff --git a/drivers/net/wireless/iwlwifi/iwl-dev.h b/drivers/net/wireless/iwlwifi/iwl-dev.h
index 34306b6798e..572250ee9d5 100644
--- a/drivers/net/wireless/iwlwifi/iwl-dev.h
+++ b/drivers/net/wireless/iwlwifi/iwl-dev.h
@@ -411,7 +411,7 @@ struct iwl_ht_info {
 	u8 max_amsdu_size;
 	u8 ampdu_factor;
 	u8 mpdu_density;
-	u8 supp_mcs_set[16];
+	struct ieee80211_mcs_info mcs;
 	/* BSS related data */
 	u8 control_channel;
 	u8 extension_chan_offset;
@@ -585,7 +585,7 @@ struct iwl_addsta_cmd;
 extern int iwl_send_add_sta(struct iwl_priv *priv,
 			    struct iwl_addsta_cmd *sta, u8 flags);
 extern u8 iwl_add_station_flags(struct iwl_priv *priv, const u8 *addr,
-			int is_ap, u8 flags, struct ieee80211_ht_info *ht_info);
+			int is_ap, u8 flags, struct ieee80211_sta_ht_cap *ht_info);
 extern void iwl4965_update_chain_flags(struct iwl_priv *priv);
 extern int iwl4965_set_pwr_src(struct iwl_priv *priv, enum iwl_pwr_src src);
 extern const u8 iwl_bcast_addr[ETH_ALEN];
diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c
index 3b0bee331a3..78d16bd7b74 100644
--- a/drivers/net/wireless/iwlwifi/iwl-scan.c
+++ b/drivers/net/wireless/iwlwifi/iwl-scan.c
@@ -550,7 +550,7 @@ static void iwl_ht_cap_to_ie(const struct ieee80211_supported_band *sband,
 {
 	struct ieee80211_ht_cap *ht_cap;
 
-	if (!sband || !sband->ht_info.ht_supported)
+	if (!sband || !sband->ht_cap.ht_supported)
 		return;
 
 	if (*left < sizeof(struct ieee80211_ht_cap))
@@ -559,12 +559,12 @@ static void iwl_ht_cap_to_ie(const struct ieee80211_supported_band *sband,
 	*pos++ = sizeof(struct ieee80211_ht_cap);
 	ht_cap = (struct ieee80211_ht_cap *) pos;
 
-	ht_cap->cap_info = cpu_to_le16(sband->ht_info.cap);
-	memcpy(ht_cap->supp_mcs_set, sband->ht_info.supp_mcs_set, 16);
+	ht_cap->cap_info = cpu_to_le16(sband->ht_cap.cap);
+	memcpy(&ht_cap->mcs, &sband->ht_cap.mcs, 16);
 	ht_cap->ampdu_params_info =
-		(sband->ht_info.ampdu_factor & IEEE80211_HT_CAP_AMPDU_FACTOR) |
-		((sband->ht_info.ampdu_density << 2) &
-			IEEE80211_HT_CAP_AMPDU_DENSITY);
+		(sband->ht_cap.ampdu_factor & IEEE80211_HT_AMPDU_PARM_FACTOR) |
+		((sband->ht_cap.ampdu_density << 2) &
+			IEEE80211_HT_AMPDU_PARM_DENSITY);
 	*left -= sizeof(struct ieee80211_ht_cap);
 }
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.c b/drivers/net/wireless/iwlwifi/iwl-sta.c
index a28a8decc79..b9b8554433a 100644
--- a/drivers/net/wireless/iwlwifi/iwl-sta.c
+++ b/drivers/net/wireless/iwlwifi/iwl-sta.c
@@ -181,7 +181,7 @@ int iwl_send_add_sta(struct iwl_priv *priv,
 EXPORT_SYMBOL(iwl_send_add_sta);
 
 static void iwl_set_ht_add_station(struct iwl_priv *priv, u8 index,
-				   struct ieee80211_ht_info *sta_ht_inf)
+				   struct ieee80211_sta_ht_cap *sta_ht_inf)
 {
 	__le32 sta_flags;
 	u8 mimo_ps_mode;
@@ -229,7 +229,7 @@ static void iwl_set_ht_add_station(struct iwl_priv *priv, u8 index,
  * iwl_add_station_flags - Add station to tables in driver and device
  */
 u8 iwl_add_station_flags(struct iwl_priv *priv, const u8 *addr, int is_ap,
-			 u8 flags, struct ieee80211_ht_info *ht_info)
+			 u8 flags, struct ieee80211_sta_ht_cap *ht_info)
 {
 	int i;
 	int sta_id = IWL_INVALID_STATION;
@@ -894,7 +894,7 @@ int iwl_rxon_add_station(struct iwl_priv *priv, const u8 *addr, int is_ap)
 
 	/* Add station to device's station table */
 	struct ieee80211_conf *conf = &priv->hw->conf;
-	struct ieee80211_ht_info *cur_ht_config = &conf->ht_conf;
+	struct ieee80211_sta_ht_cap *cur_ht_config = &conf->ht_cap;
 
 	if ((is_ap) &&
 	    (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) &&
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index e23d9a52d08..3f236b54668 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -563,19 +563,18 @@ static int __init init_mac80211_hwsim(void)
 		data->band.n_channels = ARRAY_SIZE(hwsim_channels);
 		data->band.bitrates = data->rates;
 		data->band.n_bitrates = ARRAY_SIZE(hwsim_rates);
-		data->band.ht_info.ht_supported = 1;
-		data->band.ht_info.cap = IEEE80211_HT_CAP_SUP_WIDTH |
+		data->band.ht_cap.ht_supported = true;
+		data->band.ht_cap.cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
 			IEEE80211_HT_CAP_GRN_FLD |
 			IEEE80211_HT_CAP_SGI_40 |
 			IEEE80211_HT_CAP_DSSSCCK40;
-		data->band.ht_info.ampdu_factor = 0x3;
-		data->band.ht_info.ampdu_density = 0x6;
-		memset(data->band.ht_info.supp_mcs_set, 0,
-		       sizeof(data->band.ht_info.supp_mcs_set));
-		data->band.ht_info.supp_mcs_set[0] = 0xff;
-		data->band.ht_info.supp_mcs_set[1] = 0xff;
-		data->band.ht_info.supp_mcs_set[12] =
-			IEEE80211_HT_CAP_MCS_TX_DEFINED;
+		data->band.ht_cap.ampdu_factor = 0x3;
+		data->band.ht_cap.ampdu_density = 0x6;
+		memset(&data->band.ht_cap.mcs, 0,
+		       sizeof(data->band.ht_cap.mcs));
+		data->band.ht_cap.mcs.rx_mask[0] = 0xff;
+		data->band.ht_cap.mcs.rx_mask[1] = 0xff;
+		data->band.ht_cap.mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED;
 		hw->wiphy->bands[IEEE80211_BAND_2GHZ] = &data->band;
 
 		err = ieee80211_register_hw(hw);
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 14126bc3664..64a4abce6d9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -685,28 +685,88 @@ struct ieee80211_bar {
 #define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL     0x0000
 #define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA  0x0004
 
+
+#define IEEE80211_HT_MCS_MASK_LEN		10
+
+/**
+ * struct ieee80211_mcs_info - MCS information
+ * @rx_mask: RX mask
+ * @rx_highest: highest supported RX rate
+ * @tx_params: TX parameters
+ */
+struct ieee80211_mcs_info {
+	u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN];
+	__le16 rx_highest;
+	u8 tx_params;
+	u8 reserved[3];
+} __attribute__((packed));
+
+/* 802.11n HT capability MSC set */
+#define IEEE80211_HT_MCS_RX_HIGHEST_MASK	0x3ff
+#define IEEE80211_HT_MCS_TX_DEFINED		0x01
+#define IEEE80211_HT_MCS_TX_RX_DIFF		0x02
+/* value 0 == 1 stream etc */
+#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK	0x0C
+#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT	2
+#define		IEEE80211_HT_MCS_TX_MAX_STREAMS	4
+#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION	0x10
+
+/*
+ * 802.11n D5.0 20.3.5 / 20.6 says:
+ * - indices 0 to 7 and 32 are single spatial stream
+ * - 8 to 31 are multiple spatial streams using equal modulation
+ *   [8..15 for two streams, 16..23 for three and 24..31 for four]
+ * - remainder are multiple spatial streams using unequal modulation
+ */
+#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33
+#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \
+	(IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8)
+
 /**
  * struct ieee80211_ht_cap - HT capabilities
  *
- * This structure refers to "HT capabilities element" as
- * described in 802.11n draft section 7.3.2.52
+ * This structure is the "HT capabilities element" as
+ * described in 802.11n D5.0 7.3.2.57
  */
 struct ieee80211_ht_cap {
 	__le16 cap_info;
 	u8 ampdu_params_info;
-	u8 supp_mcs_set[16];
+
+	/* 16 bytes MCS information */
+	struct ieee80211_mcs_info mcs;
+
 	__le16 extended_ht_cap_info;
 	__le32 tx_BF_cap_info;
 	u8 antenna_selection_info;
 } __attribute__ ((packed));
 
+/* 802.11n HT capabilities masks (for cap_info) */
+#define IEEE80211_HT_CAP_LDPC_CODING		0x0001
+#define IEEE80211_HT_CAP_SUP_WIDTH_20_40	0x0002
+#define IEEE80211_HT_CAP_SM_PS			0x000C
+#define IEEE80211_HT_CAP_GRN_FLD		0x0010
+#define IEEE80211_HT_CAP_SGI_20			0x0020
+#define IEEE80211_HT_CAP_SGI_40			0x0040
+#define IEEE80211_HT_CAP_TX_STBC		0x0080
+#define IEEE80211_HT_CAP_RX_STBC		0x0300
+#define IEEE80211_HT_CAP_DELAY_BA		0x0400
+#define IEEE80211_HT_CAP_MAX_AMSDU		0x0800
+#define IEEE80211_HT_CAP_DSSSCCK40		0x1000
+#define IEEE80211_HT_CAP_PSMP_SUPPORT		0x2000
+#define IEEE80211_HT_CAP_40MHZ_INTOLERANT	0x4000
+#define IEEE80211_HT_CAP_LSIG_TXOP_PROT		0x8000
+
+/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */
+#define IEEE80211_HT_AMPDU_PARM_FACTOR		0x03
+#define IEEE80211_HT_AMPDU_PARM_DENSITY		0x1C
+
 /**
- * struct ieee80211_ht_cap - HT additional information
+ * struct ieee80211_ht_info - HT information
  *
- * This structure refers to "HT information element" as
- * described in 802.11n draft section 7.3.2.53
+ * This structure is the "HT information element" as
+ * described in 802.11n D5.0 7.3.2.58
  */
-struct ieee80211_ht_addt_info {
+struct ieee80211_ht_info {
 	u8 control_chan;
 	u8 ht_param;
 	__le16 operation_mode;
@@ -714,36 +774,33 @@ struct ieee80211_ht_addt_info {
 	u8 basic_set[16];
 } __attribute__ ((packed));
 
-/* 802.11n HT capabilities masks */
-#define IEEE80211_HT_CAP_SUP_WIDTH		0x0002
-#define IEEE80211_HT_CAP_SM_PS			0x000C
-#define IEEE80211_HT_CAP_GRN_FLD		0x0010
-#define IEEE80211_HT_CAP_SGI_20			0x0020
-#define IEEE80211_HT_CAP_SGI_40			0x0040
-#define IEEE80211_HT_CAP_DELAY_BA		0x0400
-#define IEEE80211_HT_CAP_MAX_AMSDU		0x0800
-#define IEEE80211_HT_CAP_DSSSCCK40		0x1000
-/* 802.11n HT capability AMPDU settings */
-#define IEEE80211_HT_CAP_AMPDU_FACTOR		0x03
-#define IEEE80211_HT_CAP_AMPDU_DENSITY		0x1C
-/* 802.11n HT capability MSC set */
-#define IEEE80211_SUPP_MCS_SET_UEQM		4
-#define IEEE80211_HT_CAP_MAX_STREAMS		4
-#define IEEE80211_SUPP_MCS_SET_LEN		10
-/* maximum streams the spec allows */
-#define IEEE80211_HT_CAP_MCS_TX_DEFINED		0x01
-#define IEEE80211_HT_CAP_MCS_TX_RX_DIFF		0x02
-#define IEEE80211_HT_CAP_MCS_TX_STREAMS		0x0C
-#define IEEE80211_HT_CAP_MCS_TX_UEQM		0x10
-/* 802.11n HT IE masks */
-#define IEEE80211_HT_IE_CHA_SEC_OFFSET		0x03
-#define IEEE80211_HT_IE_CHA_SEC_NONE	 	0x00
-#define IEEE80211_HT_IE_CHA_SEC_ABOVE 		0x01
-#define IEEE80211_HT_IE_CHA_SEC_BELOW 		0x03
-#define IEEE80211_HT_IE_CHA_WIDTH		0x04
-#define IEEE80211_HT_IE_HT_PROTECTION		0x0003
-#define IEEE80211_HT_IE_NON_GF_STA_PRSNT	0x0004
-#define IEEE80211_HT_IE_NON_HT_STA_PRSNT	0x0010
+/* for ht_param */
+#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET		0x03
+#define		IEEE80211_HT_PARAM_CHA_SEC_NONE		0x00
+#define		IEEE80211_HT_PARAM_CHA_SEC_ABOVE	0x01
+#define		IEEE80211_HT_PARAM_CHA_SEC_BELOW	0x03
+#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY		0x04
+#define IEEE80211_HT_PARAM_RIFS_MODE			0x08
+#define IEEE80211_HT_PARAM_SPSMP_SUPPORT		0x10
+#define IEEE80211_HT_PARAM_SERV_INTERVAL_GRAN		0xE0
+
+/* for operation_mode */
+#define IEEE80211_HT_OP_MODE_PROTECTION			0x0003
+#define		IEEE80211_HT_OP_MODE_PROTECTION_NONE		0
+#define		IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER	1
+#define		IEEE80211_HT_OP_MODE_PROTECTION_20MHZ		2
+#define		IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED	3
+#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT		0x0004
+#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT		0x0010
+
+/* for stbc_param */
+#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON		0x0040
+#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT		0x0080
+#define IEEE80211_HT_STBC_PARAM_STBC_BEACON		0x0100
+#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT	0x0200
+#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE		0x0400
+#define IEEE80211_HT_STBC_PARAM_PCO_PHASE		0x0800
+
 
 /* block-ack parameters */
 #define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002
@@ -949,7 +1006,7 @@ enum ieee80211_eid {
 	WLAN_EID_EXT_SUPP_RATES = 50,
 	/* 802.11n */
 	WLAN_EID_HT_CAPABILITY = 45,
-	WLAN_EID_HT_EXTRA_INFO = 61,
+	WLAN_EID_HT_INFORMATION = 61,
 	/* 802.11i */
 	WLAN_EID_RSN = 48,
 	WLAN_EID_WPA = 221,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d1466e7a47b..2870f3973f1 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -191,7 +191,7 @@ enum ieee80211_bss_change {
  * @beacon_int: beacon interval
  * @assoc_capability: capabilities taken from assoc resp
  * @assoc_ht: association in HT mode
- * @ht_conf: ht capabilities
+ * @ht_cap: ht capabilities
  * @ht_bss_conf: ht extended capabilities
  * @basic_rates: bitmap of basic rates, each bit stands for an
  *	index into the rate table configured by the driver in
@@ -212,7 +212,7 @@ struct ieee80211_bss_conf {
 	u64 basic_rates;
 	/* ht related data */
 	bool assoc_ht;
-	struct ieee80211_ht_info *ht_conf;
+	struct ieee80211_sta_ht_cap *ht_cap;
 	struct ieee80211_ht_bss_info *ht_bss_conf;
 };
 
@@ -477,7 +477,7 @@ static inline int __deprecated __IEEE80211_CONF_SHORT_SLOT_TIME(void)
  * @antenna_sel_tx: transmit antenna selection, 0: default/diversity,
  *	1/2: antenna 0/1
  * @antenna_sel_rx: receive antenna selection, like @antenna_sel_tx
- * @ht_conf: describes current self configuration of 802.11n HT capabilies
+ * @ht_cap: describes current self configuration of 802.11n HT capabilities
  * @ht_bss_conf: describes current BSS configuration of 802.11n HT parameters
  * @channel: the channel to tune to
  */
@@ -493,7 +493,7 @@ struct ieee80211_conf {
 
 	struct ieee80211_channel *channel;
 
-	struct ieee80211_ht_info ht_conf;
+	struct ieee80211_sta_ht_cap ht_cap;
 	struct ieee80211_ht_bss_info ht_bss_conf;
 };
 
@@ -687,7 +687,7 @@ enum set_key_cmd {
  * @addr: MAC address
  * @aid: AID we assigned to the station if we're an AP
  * @supp_rates: Bitmap of supported rates (per band)
- * @ht_info: HT capabilities of this STA
+ * @ht_cap: HT capabilities of this STA
  * @drv_priv: data area for driver use, will always be aligned to
  *	sizeof(void *), size is determined in hw information.
  */
@@ -695,7 +695,7 @@ struct ieee80211_sta {
 	u64 supp_rates[IEEE80211_NUM_BANDS];
 	u8 addr[ETH_ALEN];
 	u16 aid;
-	struct ieee80211_ht_info ht_info;
+	struct ieee80211_sta_ht_cap ht_cap;
 
 	/* must be last */
 	u8 drv_priv[0] __attribute__((__aligned__(sizeof(void *))));
diff --git a/include/net/wireless.h b/include/net/wireless.h
index 721efb363db..c0aa0fb8db5 100644
--- a/include/net/wireless.h
+++ b/include/net/wireless.h
@@ -10,6 +10,7 @@
 #include <linux/netdevice.h>
 #include <linux/debugfs.h>
 #include <linux/list.h>
+#include <linux/ieee80211.h>
 #include <net/cfg80211.h>
 
 /**
@@ -133,23 +134,23 @@ struct ieee80211_rate {
 };
 
 /**
- * struct ieee80211_ht_info - describing STA's HT capabilities
+ * struct ieee80211_sta_ht_cap - STA's HT capabilities
  *
  * This structure describes most essential parameters needed
  * to describe 802.11n HT capabilities for an STA.
  *
- * @ht_supported: is HT supported by STA, 0: no, 1: yes
+ * @ht_supported: is HT supported by the STA
  * @cap: HT capabilities map as described in 802.11n spec
  * @ampdu_factor: Maximum A-MPDU length factor
  * @ampdu_density: Minimum A-MPDU spacing
- * @supp_mcs_set: Supported MCS set as described in 802.11n spec
+ * @mcs: Supported MCS rates
  */
-struct ieee80211_ht_info {
+struct ieee80211_sta_ht_cap {
 	u16 cap; /* use IEEE80211_HT_CAP_ */
-	u8 ht_supported;
+	bool ht_supported;
 	u8 ampdu_factor;
 	u8 ampdu_density;
-	u8 supp_mcs_set[16];
+	struct ieee80211_mcs_info mcs;
 };
 
 /**
@@ -173,7 +174,7 @@ struct ieee80211_supported_band {
 	enum ieee80211_band band;
 	int n_channels;
 	int n_bitrates;
-	struct ieee80211_ht_info ht_info;
+	struct ieee80211_sta_ht_cap ht_cap;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index cf3fd5d6066..a5dea617aab 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -633,10 +633,9 @@ static void sta_apply_parameters(struct ieee80211_local *local,
 		sta->sta.supp_rates[local->oper_channel->band] = rates;
 	}
 
-	if (params->ht_capa) {
-		ieee80211_ht_cap_ie_to_ht_info(params->ht_capa,
-					       &sta->sta.ht_info);
-	}
+	if (params->ht_capa)
+		ieee80211_ht_cap_ie_to_sta_ht_cap(params->ht_capa,
+						  &sta->sta.ht_cap);
 
 	if (ieee80211_vif_is_mesh(&sdata->vif) && params->plink_action) {
 		switch (params->plink_action) {
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index b854483cf23..e2d121bf274 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -20,37 +20,33 @@
 #include "sta_info.h"
 #include "wme.h"
 
-int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
-				   struct ieee80211_ht_info *ht_info)
+void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_ht_cap *ht_cap_ie,
+				       struct ieee80211_sta_ht_cap *ht_cap)
 {
 
-	if (ht_info == NULL)
-		return -EINVAL;
+	BUG_ON(!ht_cap);
 
-	memset(ht_info, 0, sizeof(*ht_info));
+	memset(ht_cap, 0, sizeof(*ht_cap));
 
 	if (ht_cap_ie) {
 		u8 ampdu_info = ht_cap_ie->ampdu_params_info;
 
-		ht_info->ht_supported = 1;
-		ht_info->cap = le16_to_cpu(ht_cap_ie->cap_info);
-		ht_info->ampdu_factor =
-			ampdu_info & IEEE80211_HT_CAP_AMPDU_FACTOR;
-		ht_info->ampdu_density =
-			(ampdu_info & IEEE80211_HT_CAP_AMPDU_DENSITY) >> 2;
-		memcpy(ht_info->supp_mcs_set, ht_cap_ie->supp_mcs_set, 16);
+		ht_cap->ht_supported = true;
+		ht_cap->cap = le16_to_cpu(ht_cap_ie->cap_info);
+		ht_cap->ampdu_factor =
+			ampdu_info & IEEE80211_HT_AMPDU_PARM_FACTOR;
+		ht_cap->ampdu_density =
+			(ampdu_info & IEEE80211_HT_AMPDU_PARM_DENSITY) >> 2;
+		memcpy(&ht_cap->mcs, &ht_cap_ie->mcs, sizeof(ht_cap->mcs));
 	} else
-		ht_info->ht_supported = 0;
-
-	return 0;
+		ht_cap->ht_supported = false;
 }
 
-int ieee80211_ht_addt_info_ie_to_ht_bss_info(
-			struct ieee80211_ht_addt_info *ht_add_info_ie,
+void ieee80211_ht_info_ie_to_ht_bss_info(
+			struct ieee80211_ht_info *ht_add_info_ie,
 			struct ieee80211_ht_bss_info *bss_info)
 {
-	if (bss_info == NULL)
-		return -EINVAL;
+	BUG_ON(!bss_info);
 
 	memset(bss_info, 0, sizeof(*bss_info));
 
@@ -62,8 +58,119 @@ int ieee80211_ht_addt_info_ie_to_ht_bss_info(
 		bss_info->bss_cap = ht_add_info_ie->ht_param;
 		bss_info->bss_op_mode = (u8)(op_mode & 0xff);
 	}
+}
+
+/*
+ * ieee80211_handle_ht should be called only after the operating band
+ * has been determined as ht configuration depends on the hw's
+ * HT abilities for a specific band.
+ */
+u32 ieee80211_handle_ht(struct ieee80211_local *local,
+			struct ieee80211_sta_ht_cap *req_ht_cap,
+			struct ieee80211_ht_bss_info *req_bss_cap)
+{
+	struct ieee80211_conf *conf = &local->hw.conf;
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_sta_ht_cap ht_cap;
+	struct ieee80211_ht_bss_info ht_bss_conf;
+	u32 changed = 0;
+	int i;
+	u8 max_tx_streams;
+	u8 tx_mcs_set_cap;
+	bool enable_ht = true;
+
+	sband = local->hw.wiphy->bands[conf->channel->band];
+
+	memset(&ht_cap, 0, sizeof(ht_cap));
+	memset(&ht_bss_conf, 0, sizeof(struct ieee80211_ht_bss_info));
+
+	/* HT is not supported */
+	if (!sband->ht_cap.ht_supported)
+		enable_ht = false;
+
+	/* disable HT */
+	if (!enable_ht) {
+		if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE)
+			changed |= BSS_CHANGED_HT;
+		conf->flags &= ~IEEE80211_CONF_SUPPORT_HT_MODE;
+		conf->ht_cap.ht_supported = false;
+		return changed;
+	}
+
+
+	if (!(conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE))
+		changed |= BSS_CHANGED_HT;
+
+	conf->flags |= IEEE80211_CONF_SUPPORT_HT_MODE;
+	ht_cap.ht_supported = true;
+
+	ht_cap.cap = req_ht_cap->cap & sband->ht_cap.cap;
+	ht_cap.cap &= ~IEEE80211_HT_CAP_SM_PS;
+	ht_cap.cap |= sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS;
+
+	ht_bss_conf.primary_channel = req_bss_cap->primary_channel;
+	ht_bss_conf.bss_cap = req_bss_cap->bss_cap;
+	ht_bss_conf.bss_op_mode = req_bss_cap->bss_op_mode;
+
+	ht_cap.ampdu_factor = req_ht_cap->ampdu_factor;
+	ht_cap.ampdu_density = req_ht_cap->ampdu_density;
+
+	/* own MCS TX capabilities */
+	tx_mcs_set_cap = sband->ht_cap.mcs.tx_params;
+
+	/*
+	 * configure supported Tx MCS according to requested MCS
+	 * (based in most cases on Rx capabilities of peer) and self
+	 * Tx MCS capabilities (as defined by low level driver HW
+	 * Tx capabilities)
+	 */
+
+	/* can we TX with MCS rates? */
+	if (!(tx_mcs_set_cap & IEEE80211_HT_MCS_TX_DEFINED))
+		goto check_changed;
+
+	/* Counting from 0, therefore +1 */
+	if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_RX_DIFF)
+		max_tx_streams =
+			((tx_mcs_set_cap & IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK)
+				>> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1;
+	else
+		max_tx_streams = IEEE80211_HT_MCS_TX_MAX_STREAMS;
+
+	/*
+	 * 802.11n D5.0 20.3.5 / 20.6 says:
+	 * - indices 0 to 7 and 32 are single spatial stream
+	 * - 8 to 31 are multiple spatial streams using equal modulation
+	 *   [8..15 for two streams, 16..23 for three and 24..31 for four]
+	 * - remainder are multiple spatial streams using unequal modulation
+	 */
+	for (i = 0; i < max_tx_streams; i++)
+		ht_cap.mcs.rx_mask[i] =
+			sband->ht_cap.mcs.rx_mask[i] &
+					req_ht_cap->mcs.rx_mask[i];
+
+	if (tx_mcs_set_cap & IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION)
+		for (i = IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE;
+		     i < IEEE80211_HT_MCS_MASK_LEN; i++)
+			ht_cap.mcs.rx_mask[i] =
+				sband->ht_cap.mcs.rx_mask[i] &
+					req_ht_cap->mcs.rx_mask[i];
+
+	/* handle MCS rate 32 too */
+	if (sband->ht_cap.mcs.rx_mask[32/8] &
+	    req_ht_cap->mcs.rx_mask[32/8] & 1)
+		ht_cap.mcs.rx_mask[32/8] |= 1;
+
+ check_changed:
+	/* if bss configuration changed store the new one */
+	if (memcmp(&conf->ht_cap, &ht_cap, sizeof(ht_cap)) ||
+	    memcmp(&conf->ht_bss_conf, &ht_bss_conf, sizeof(ht_bss_conf))) {
+		changed |= BSS_CHANGED_HT;
+		memcpy(&conf->ht_cap, &ht_cap, sizeof(ht_cap));
+		memcpy(&conf->ht_bss_conf, &ht_bss_conf, sizeof(ht_bss_conf));
+	}
 
-	return 0;
+	return changed;
 }
 
 static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
@@ -794,7 +901,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 	 * check if configuration can support the BA policy
 	 * and if buffer size does not exceeds max value */
 	if (((ba_policy != 1)
-		&& (!(conf->ht_conf.cap & IEEE80211_HT_CAP_DELAY_BA)))
+		&& (!(conf->ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA)))
 		|| (buf_size > IEEE80211_MAX_AMPDU_BUF)) {
 		status = WLAN_STATUS_INVALID_QOS_PARAM;
 #ifdef CONFIG_MAC80211_HT_DEBUG
@@ -812,7 +919,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
 
 		sband = local->hw.wiphy->bands[conf->channel->band];
 		buf_size = IEEE80211_MIN_AMPDU_BUF;
-		buf_size = buf_size << sband->ht_info.ampdu_factor;
+		buf_size = buf_size << sband->ht_cap.ampdu_factor;
 	}
 
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f49cb9e8bb4..ae4ca3e8b44 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -817,7 +817,7 @@ struct ieee802_11_elems {
 	u8 *wmm_info;
 	u8 *wmm_param;
 	struct ieee80211_ht_cap *ht_cap_elem;
-	struct ieee80211_ht_addt_info *ht_info_elem;
+	struct ieee80211_ht_info *ht_info_elem;
 	u8 *mesh_config;
 	u8 *mesh_id;
 	u8 *peer_link;
@@ -880,9 +880,6 @@ static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr)
 int ieee80211_hw_config(struct ieee80211_local *local);
 int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed);
 void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
-u32 ieee80211_handle_ht(struct ieee80211_local *local, int enable_ht,
-			struct ieee80211_ht_info *req_ht_cap,
-			struct ieee80211_ht_bss_info *req_bss_cap);
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u32 changed);
 void ieee80211_configure_filter(struct ieee80211_local *local);
@@ -963,11 +960,14 @@ int ieee80211_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
 /* HT */
-int ieee80211_ht_cap_ie_to_ht_info(struct ieee80211_ht_cap *ht_cap_ie,
-				   struct ieee80211_ht_info *ht_info);
-int ieee80211_ht_addt_info_ie_to_ht_bss_info(
-			struct ieee80211_ht_addt_info *ht_add_info_ie,
+void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_ht_cap *ht_cap_ie,
+				       struct ieee80211_sta_ht_cap *ht_cap);
+void ieee80211_ht_info_ie_to_ht_bss_info(
+			struct ieee80211_ht_info *ht_add_info_ie,
 			struct ieee80211_ht_bss_info *bss_info);
+u32 ieee80211_handle_ht(struct ieee80211_local *local,
+			struct ieee80211_sta_ht_cap *req_ht_cap,
+			struct ieee80211_ht_bss_info *req_bss_cap);
 void ieee80211_send_bar(struct ieee80211_sub_if_data *sdata, u8 *ra, u16 tid, u16 ssn);
 
 void ieee80211_sta_stop_rx_ba_session(struct ieee80211_sub_if_data *sdata, u8 *da,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index c427954fe8e..07f812755e5 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -232,100 +232,6 @@ int ieee80211_hw_config(struct ieee80211_local *local)
 	return ret;
 }
 
-/**
- * ieee80211_handle_ht should be used only after legacy configuration
- * has been determined namely band, as ht configuration depends upon
- * the hardware's HT abilities for a _specific_ band.
- */
-u32 ieee80211_handle_ht(struct ieee80211_local *local, int enable_ht,
-			   struct ieee80211_ht_info *req_ht_cap,
-			   struct ieee80211_ht_bss_info *req_bss_cap)
-{
-	struct ieee80211_conf *conf = &local->hw.conf;
-	struct ieee80211_supported_band *sband;
-	struct ieee80211_ht_info ht_conf;
-	struct ieee80211_ht_bss_info ht_bss_conf;
-	u32 changed = 0;
-	int i;
-	u8 max_tx_streams = IEEE80211_HT_CAP_MAX_STREAMS;
-	u8 tx_mcs_set_cap;
-
-	sband = local->hw.wiphy->bands[conf->channel->band];
-
-	memset(&ht_conf, 0, sizeof(struct ieee80211_ht_info));
-	memset(&ht_bss_conf, 0, sizeof(struct ieee80211_ht_bss_info));
-
-	/* HT is not supported */
-	if (!sband->ht_info.ht_supported) {
-		conf->flags &= ~IEEE80211_CONF_SUPPORT_HT_MODE;
-		goto out;
-	}
-
-	/* disable HT */
-	if (!enable_ht) {
-		if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE)
-			changed |= BSS_CHANGED_HT;
-		conf->flags &= ~IEEE80211_CONF_SUPPORT_HT_MODE;
-		conf->ht_conf.ht_supported = 0;
-		goto out;
-	}
-
-
-	if (!(conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE))
-		changed |= BSS_CHANGED_HT;
-
-	conf->flags |= IEEE80211_CONF_SUPPORT_HT_MODE;
-	ht_conf.ht_supported = 1;
-
-	ht_conf.cap = req_ht_cap->cap & sband->ht_info.cap;
-	ht_conf.cap &= ~(IEEE80211_HT_CAP_SM_PS);
-	ht_conf.cap |= sband->ht_info.cap & IEEE80211_HT_CAP_SM_PS;
-	ht_bss_conf.primary_channel = req_bss_cap->primary_channel;
-	ht_bss_conf.bss_cap = req_bss_cap->bss_cap;
-	ht_bss_conf.bss_op_mode = req_bss_cap->bss_op_mode;
-
-	ht_conf.ampdu_factor = req_ht_cap->ampdu_factor;
-	ht_conf.ampdu_density = req_ht_cap->ampdu_density;
-
-	/* Bits 96-100 */
-	tx_mcs_set_cap = sband->ht_info.supp_mcs_set[12];
-
-	/* configure suppoerted Tx MCS according to requested MCS
-	 * (based in most cases on Rx capabilities of peer) and self
-	 * Tx MCS capabilities (as defined by low level driver HW
-	 * Tx capabilities) */
-	if (!(tx_mcs_set_cap & IEEE80211_HT_CAP_MCS_TX_DEFINED))
-		goto check_changed;
-
-	/* Counting from 0 therfore + 1 */
-	if (tx_mcs_set_cap & IEEE80211_HT_CAP_MCS_TX_RX_DIFF)
-		max_tx_streams = ((tx_mcs_set_cap &
-				IEEE80211_HT_CAP_MCS_TX_STREAMS) >> 2) + 1;
-
-	for (i = 0; i < max_tx_streams; i++)
-		ht_conf.supp_mcs_set[i] =
-			sband->ht_info.supp_mcs_set[i] &
-					req_ht_cap->supp_mcs_set[i];
-
-	if (tx_mcs_set_cap & IEEE80211_HT_CAP_MCS_TX_UEQM)
-		for (i = IEEE80211_SUPP_MCS_SET_UEQM;
-		     i < IEEE80211_SUPP_MCS_SET_LEN; i++)
-			ht_conf.supp_mcs_set[i] =
-				sband->ht_info.supp_mcs_set[i] &
-					req_ht_cap->supp_mcs_set[i];
-
-check_changed:
-	/* if bss configuration changed store the new one */
-	if (memcmp(&conf->ht_conf, &ht_conf, sizeof(ht_conf)) ||
-	    memcmp(&conf->ht_bss_conf, &ht_bss_conf, sizeof(ht_bss_conf))) {
-		changed |= BSS_CHANGED_HT;
-		memcpy(&conf->ht_conf, &ht_conf, sizeof(ht_conf));
-		memcpy(&conf->ht_bss_conf, &ht_bss_conf, sizeof(ht_bss_conf));
-	}
-out:
-	return changed;
-}
-
 void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 				      u32 changed)
 {
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 829995e740a..196dd39f628 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -236,7 +236,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
-	u8 *pos, *ies, *ht_add_ie;
+	u8 *pos, *ies, *ht_ie;
 	int i, len, count, rates_len, supp_rates_len;
 	u16 capab;
 	struct ieee80211_bss *bss;
@@ -393,24 +393,25 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 
 	/* wmm support is a must to HT */
 	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
-	    sband->ht_info.ht_supported &&
-	    (ht_add_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_EXTRA_INFO))) {
-		struct ieee80211_ht_addt_info *ht_add_info =
-			(struct ieee80211_ht_addt_info *)ht_add_ie;
-		u16 cap = sband->ht_info.cap;
+	    sband->ht_cap.ht_supported &&
+	    (ht_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_INFORMATION)) &&
+	    ht_ie[1] >= sizeof(struct ieee80211_ht_info)) {
+		struct ieee80211_ht_info *ht_info =
+			(struct ieee80211_ht_info *)(ht_ie + 2);
+		u16 cap = sband->ht_cap.cap;
 		__le16 tmp;
 		u32 flags = local->hw.conf.channel->flags;
 
-		switch (ht_add_info->ht_param & IEEE80211_HT_IE_CHA_SEC_OFFSET) {
-		case IEEE80211_HT_IE_CHA_SEC_ABOVE:
+		switch (ht_info->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+		case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
 			if (flags & IEEE80211_CHAN_NO_FAT_ABOVE) {
-				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH;
+				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
 				cap &= ~IEEE80211_HT_CAP_SGI_40;
 			}
 			break;
-		case IEEE80211_HT_IE_CHA_SEC_BELOW:
+		case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
 			if (flags & IEEE80211_CHAN_NO_FAT_BELOW) {
-				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH;
+				cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
 				cap &= ~IEEE80211_HT_CAP_SGI_40;
 			}
 			break;
@@ -424,9 +425,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 		memcpy(pos, &tmp, sizeof(u16));
 		pos += sizeof(u16);
 		/* TODO: needs a define here for << 2 */
-		*pos++ = sband->ht_info.ampdu_factor |
-			 (sband->ht_info.ampdu_density << 2);
-		memcpy(pos, sband->ht_info.supp_mcs_set, 16);
+		*pos++ = sband->ht_cap.ampdu_factor |
+			 (sband->ht_cap.ampdu_density << 2);
+		memcpy(pos, &sband->ht_cap.mcs, sizeof(sband->ht_cap.mcs));
 	}
 
 	kfree(ifsta->assocreq_ies);
@@ -730,7 +731,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
 	if (conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
 		changed |= BSS_CHANGED_HT;
 		sdata->bss_conf.assoc_ht = 1;
-		sdata->bss_conf.ht_conf = &conf->ht_conf;
+		sdata->bss_conf.ht_cap = &conf->ht_cap;
 		sdata->bss_conf.ht_bss_conf = &conf->ht_bss_conf;
 	}
 
@@ -850,7 +851,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 		changed |= BSS_CHANGED_HT;
 
 	sdata->bss_conf.assoc_ht = 0;
-	sdata->bss_conf.ht_conf = NULL;
+	sdata->bss_conf.ht_cap = NULL;
 	sdata->bss_conf.ht_bss_conf = NULL;
 
 	ieee80211_led_assoc(local, 0);
@@ -1335,11 +1336,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
 	if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
 	    (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
 		struct ieee80211_ht_bss_info bss_info;
-		ieee80211_ht_cap_ie_to_ht_info(
-				elems.ht_cap_elem, &sta->sta.ht_info);
-		ieee80211_ht_addt_info_ie_to_ht_bss_info(
+		ieee80211_ht_cap_ie_to_sta_ht_cap(
+				elems.ht_cap_elem, &sta->sta.ht_cap);
+		ieee80211_ht_info_ie_to_ht_bss_info(
 				elems.ht_info_elem, &bss_info);
-		ieee80211_handle_ht(local, 1, &sta->sta.ht_info, &bss_info);
+		ieee80211_handle_ht(local, &sta->sta.ht_cap, &bss_info);
 	}
 
 	rate_control_rate_init(sta);
@@ -1696,9 +1697,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 	    elems.wmm_param && conf->flags & IEEE80211_CONF_SUPPORT_HT_MODE) {
 		struct ieee80211_ht_bss_info bss_info;
 
-		ieee80211_ht_addt_info_ie_to_ht_bss_info(
+		ieee80211_ht_info_ie_to_ht_bss_info(
 				elems.ht_info_elem, &bss_info);
-		changed |= ieee80211_handle_ht(local, 1, &conf->ht_conf,
+		changed |= ieee80211_handle_ht(local, &conf->ht_cap,
 					       &bss_info);
 	}
 
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 1b605457017..9941a60a232 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -532,8 +532,8 @@ void ieee802_11_parse_elems(u8 *start, size_t len,
 			if (elen >= sizeof(struct ieee80211_ht_cap))
 				elems->ht_cap_elem = (void *)pos;
 			break;
-		case WLAN_EID_HT_EXTRA_INFO:
-			if (elen >= sizeof(struct ieee80211_ht_addt_info))
+		case WLAN_EID_HT_INFORMATION:
+			if (elen >= sizeof(struct ieee80211_ht_info))
 				elems->ht_info_elem = (void *)pos;
 			break;
 		case WLAN_EID_MESH_ID:
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 29c41040c8c..a3af1514124 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -147,7 +147,7 @@ static int ieee80211_ioctl_giwname(struct net_device *dev,
 	sband = local->hw.wiphy->bands[IEEE80211_BAND_5GHZ];
 	if (sband) {
 		is_a = 1;
-		is_ht |= sband->ht_info.ht_supported;
+		is_ht |= sband->ht_cap.ht_supported;
 	}
 
 	sband = local->hw.wiphy->bands[IEEE80211_BAND_2GHZ];
@@ -160,7 +160,7 @@ static int ieee80211_ioctl_giwname(struct net_device *dev,
 			if (sband->bitrates[i].bitrate == 60)
 				is_g = 1;
 		}
-		is_ht |= sband->ht_info.ht_supported;
+		is_ht |= sband->ht_cap.ht_supported;
 	}
 
 	strcpy(name, "IEEE 802.11");
-- 
cgit v1.2.3-70-g09d2


From d51626df5747efaa8d2c00678f64cb503845effe Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Thu, 9 Oct 2008 12:20:13 +0200
Subject: nl80211: export HT capabilities

This exports the local HT capabilities in nl80211.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 12 ++++++++++++
 net/wireless/nl80211.c  | 13 +++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 9bad65400fb..41720d47d61 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -452,17 +452,29 @@ enum nl80211_mpath_info {
  *	an array of nested frequency attributes
  * @NL80211_BAND_ATTR_RATES: supported bitrates in this band,
  *	an array of nested bitrate attributes
+ * @NL80211_BAND_ATTR_HT_MCS_SET: 16-byte attribute containing the MCS set as
+ *	defined in 802.11n
+ * @NL80211_BAND_ATTR_HT_CAPA: HT capabilities, as in the HT information IE
+ * @NL80211_BAND_ATTR_HT_AMPDU_FACTOR: A-MPDU factor, as in 11n
+ * @NL80211_BAND_ATTR_HT_AMPDU_DENSITY: A-MPDU density, as in 11n
  */
 enum nl80211_band_attr {
 	__NL80211_BAND_ATTR_INVALID,
 	NL80211_BAND_ATTR_FREQS,
 	NL80211_BAND_ATTR_RATES,
 
+	NL80211_BAND_ATTR_HT_MCS_SET,
+	NL80211_BAND_ATTR_HT_CAPA,
+	NL80211_BAND_ATTR_HT_AMPDU_FACTOR,
+	NL80211_BAND_ATTR_HT_AMPDU_DENSITY,
+
 	/* keep last */
 	__NL80211_BAND_ATTR_AFTER_LAST,
 	NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1
 };
 
+#define NL80211_BAND_ATTR_HT_CAPA NL80211_BAND_ATTR_HT_CAPA
+
 /**
  * enum nl80211_frequency_attr - frequency attributes
  * @NL80211_FREQUENCY_ATTR_FREQ: Frequency in MHz
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 572793c8c7a..4d12e885170 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -157,6 +157,19 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 		if (!nl_band)
 			goto nla_put_failure;
 
+		/* add HT info */
+		if (dev->wiphy.bands[band]->ht_cap.ht_supported) {
+			NLA_PUT(msg, NL80211_BAND_ATTR_HT_MCS_SET,
+				sizeof(dev->wiphy.bands[band]->ht_cap.mcs),
+				&dev->wiphy.bands[band]->ht_cap.mcs);
+			NLA_PUT_U16(msg, NL80211_BAND_ATTR_HT_CAPA,
+				dev->wiphy.bands[band]->ht_cap.cap);
+			NLA_PUT_U8(msg, NL80211_BAND_ATTR_HT_AMPDU_FACTOR,
+				dev->wiphy.bands[band]->ht_cap.ampdu_factor);
+			NLA_PUT_U8(msg, NL80211_BAND_ATTR_HT_AMPDU_DENSITY,
+				dev->wiphy.bands[band]->ht_cap.ampdu_density);
+		}
+
 		/* add frequencies */
 		nl_freqs = nla_nest_start(msg, NL80211_BAND_ATTR_FREQS);
 		if (!nl_freqs)
-- 
cgit v1.2.3-70-g09d2


From 93da9cc17c5ae8a751886fd4732db89ad5e9bdb9 Mon Sep 17 00:00:00 2001
From: "colin@cozybit.com" <colin@cozybit.com>
Date: Tue, 21 Oct 2008 12:03:48 -0700
Subject: Add nl80211 commands to get and set o11s mesh networking parameters

The two new commands are NL80211_CMD_GET_MESH_PARAMS and
NL80211_CMD_SET_MESH_PARAMS. There is a new attribute enum,
NL80211_ATTR_MESH_PARAMS, which enumerates the various mesh configuration
parameters.

Moved struct mesh_config from mac80211/ieee80211_i.h to net/cfg80211.h.
nl80211_get_mesh_params and nl80211_set_mesh_params unpack the netlink messages
and ask the driver to get or set the configuration.  This is done via two new
function stubs, get_mesh_params and set_mesh_params, in struct cfg80211_ops.

Signed-off-by: Colin McCabe <colin@cozybit.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h    |  86 ++++++++++++++++++++
 include/net/cfg80211.h     |  32 +++++++-
 net/mac80211/cfg.c         |  68 ++++++++++++++++
 net/mac80211/ieee80211_i.h |  21 +----
 net/wireless/nl80211.c     | 191 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 377 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 41720d47d61..e4cc7869b22 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -106,6 +106,12 @@
  * 	to the the specified ISO/IEC 3166-1 alpha2 country code. The core will
  * 	store this as a valid request and then query userspace for it.
  *
+ * @NL80211_CMD_GET_MESH_PARAMS: Get mesh networking properties for the
+ *	interface identified by %NL80211_ATTR_IFINDEX
+ *
+ * @NL80211_CMD_SET_MESH_PARAMS: Set mesh networking properties for the
+ *      interface identified by %NL80211_ATTR_IFINDEX
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -148,6 +154,9 @@ enum nl80211_commands {
 	NL80211_CMD_SET_REG,
 	NL80211_CMD_REQ_SET_REG,
 
+	NL80211_CMD_GET_MESH_PARAMS,
+	NL80211_CMD_SET_MESH_PARAMS,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -296,6 +305,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_REG_ALPHA2,
 	NL80211_ATTR_REG_RULES,
 
+	NL80211_ATTR_MESH_PARAMS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -606,4 +617,79 @@ enum nl80211_mntr_flags {
 	NL80211_MNTR_FLAG_MAX = __NL80211_MNTR_FLAG_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_meshconf_params - mesh configuration parameters
+ *
+ * Mesh configuration parameters
+ *
+ * @__NL80211_MESHCONF_INVALID: internal use
+ *
+ * @NL80211_MESHCONF_RETRY_TIMEOUT: specifies the initial retry timeout in
+ * millisecond units, used by the Peer Link Open message
+ *
+ * @NL80211_MESHCONF_CONFIRM_TIMEOUT: specifies the inital confirm timeout, in
+ * millisecond units, used by the peer link management to close a peer link
+ *
+ * @NL80211_MESHCONF_HOLDING_TIMEOUT: specifies the holding timeout, in
+ * millisecond units
+ *
+ * @NL80211_MESHCONF_MAX_PEER_LINKS: maximum number of peer links allowed
+ * on this mesh interface
+ *
+ * @NL80211_MESHCONF_MAX_RETRIES: specifies the maximum number of peer link
+ * open retries that can be sent to establish a new peer link instance in a
+ * mesh
+ *
+ * @NL80211_MESHCONF_TTL: specifies the value of TTL field set at a source mesh
+ * point.
+ *
+ * @NL80211_MESHCONF_AUTO_OPEN_PLINKS: whether we should automatically
+ * open peer links when we detect compatible mesh peers.
+ *
+ * @NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES: the number of action frames
+ * containing a PREQ that an MP can send to a particular destination (path
+ * target)
+ *
+ * @NL80211_MESHCONF_PATH_REFRESH_TIME: how frequently to refresh mesh paths
+ * (in milliseconds)
+ *
+ * @NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT: minimum length of time to wait
+ * until giving up on a path discovery (in milliseconds)
+ *
+ * @NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT: The time (in TUs) for which mesh
+ * points receiving a PREQ shall consider the forwarding information from the
+ * root to be valid. (TU = time unit)
+ *
+ * @NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL: The minimum interval of time (in
+ * TUs) during which an MP can send only one action frame containing a PREQ
+ * reference element
+ *
+ * @NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME: The interval of time (in TUs)
+ * that it takes for an HWMP information element to propagate across the mesh
+ *
+ * @NL80211_MESHCONF_ATTR_MAX: highest possible mesh configuration attribute
+ *
+ * @__NL80211_MESHCONF_ATTR_AFTER_LAST: internal use
+ */
+enum nl80211_meshconf_params {
+	__NL80211_MESHCONF_INVALID,
+	NL80211_MESHCONF_RETRY_TIMEOUT,
+	NL80211_MESHCONF_CONFIRM_TIMEOUT,
+	NL80211_MESHCONF_HOLDING_TIMEOUT,
+	NL80211_MESHCONF_MAX_PEER_LINKS,
+	NL80211_MESHCONF_MAX_RETRIES,
+	NL80211_MESHCONF_TTL,
+	NL80211_MESHCONF_AUTO_OPEN_PLINKS,
+	NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+	NL80211_MESHCONF_PATH_REFRESH_TIME,
+	NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+	NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+	NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+	NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+
+	/* keep last */
+	__NL80211_MESHCONF_ATTR_AFTER_LAST,
+	NL80211_MESHCONF_ATTR_MAX = __NL80211_MESHCONF_ATTR_AFTER_LAST - 1
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0e85ec39b63..03e1e88c6a0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -347,6 +347,25 @@ struct ieee80211_regdomain {
 	.flags = reg_flags, \
 	}
 
+struct mesh_config {
+	/* Timeouts in ms */
+	/* Mesh plink management parameters */
+	u16 dot11MeshRetryTimeout;
+	u16 dot11MeshConfirmTimeout;
+	u16 dot11MeshHoldingTimeout;
+	u16 dot11MeshMaxPeerLinks;
+	u8  dot11MeshMaxRetries;
+	u8  dot11MeshTTL;
+	bool auto_open_plinks;
+	/* HWMP parameters */
+	u8  dot11MeshHWMPmaxPREQretries;
+	u32 path_refresh_time;
+	u16 min_discovery_timeout;
+	u32 dot11MeshHWMPactivePathTimeout;
+	u16 dot11MeshHWMPpreqMinInterval;
+	u16 dot11MeshHWMPnetDiameterTraversalTime;
+};
+
 /* from net/wireless.h */
 struct wiphy;
 
@@ -397,6 +416,12 @@ struct wiphy;
  *
  * @change_station: Modify a given station.
  *
+ * @get_mesh_params: Put the current mesh parameters into *params
+ *
+ * @set_mesh_params: Set mesh parameters.
+ *	The mask is a bitfield which tells us which parameters to
+ *	set, and which to leave alone.
+ *
  * @set_mesh_cfg: set mesh parameters (by now, just mesh id)
  *
  * @change_bss: Modify parameters for a given BSS.
@@ -452,7 +477,12 @@ struct cfg80211_ops {
 	int	(*dump_mpath)(struct wiphy *wiphy, struct net_device *dev,
 			       int idx, u8 *dst, u8 *next_hop,
 			       struct mpath_info *pinfo);
-
+	int	(*get_mesh_params)(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct mesh_config *conf);
+	int	(*set_mesh_params)(struct wiphy *wiphy,
+				struct net_device *dev,
+				const struct mesh_config *nconf, u32 mask);
 	int	(*change_bss)(struct wiphy *wiphy, struct net_device *dev,
 			      struct bss_parameters *params);
 };
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 55e3a26510e..91f56a48e2b 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -951,6 +951,72 @@ static int ieee80211_dump_mpath(struct wiphy *wiphy, struct net_device *dev,
 	rcu_read_unlock();
 	return 0;
 }
+
+static int ieee80211_get_mesh_params(struct wiphy *wiphy,
+				struct net_device *dev,
+				struct mesh_config *conf)
+{
+	struct ieee80211_sub_if_data *sdata;
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
+		return -ENOTSUPP;
+	memcpy(conf, &(sdata->u.mesh.mshcfg), sizeof(struct mesh_config));
+	return 0;
+}
+
+static inline bool _chg_mesh_attr(enum nl80211_meshconf_params parm, u32 mask)
+{
+	return (mask >> (parm-1)) & 0x1;
+}
+
+static int ieee80211_set_mesh_params(struct wiphy *wiphy,
+				struct net_device *dev,
+				const struct mesh_config *nconf, u32 mask)
+{
+	struct mesh_config *conf;
+	struct ieee80211_sub_if_data *sdata;
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+	if (sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
+		return -ENOTSUPP;
+
+	/* Set the config options which we are interested in setting */
+	conf = &(sdata->u.mesh.mshcfg);
+	if (_chg_mesh_attr(NL80211_MESHCONF_RETRY_TIMEOUT, mask))
+		conf->dot11MeshRetryTimeout = nconf->dot11MeshRetryTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_CONFIRM_TIMEOUT, mask))
+		conf->dot11MeshConfirmTimeout = nconf->dot11MeshConfirmTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HOLDING_TIMEOUT, mask))
+		conf->dot11MeshHoldingTimeout = nconf->dot11MeshHoldingTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_MAX_PEER_LINKS, mask))
+		conf->dot11MeshMaxPeerLinks = nconf->dot11MeshMaxPeerLinks;
+	if (_chg_mesh_attr(NL80211_MESHCONF_MAX_RETRIES, mask))
+		conf->dot11MeshMaxRetries = nconf->dot11MeshMaxRetries;
+	if (_chg_mesh_attr(NL80211_MESHCONF_TTL, mask))
+		conf->dot11MeshTTL = nconf->dot11MeshTTL;
+	if (_chg_mesh_attr(NL80211_MESHCONF_AUTO_OPEN_PLINKS, mask))
+		conf->auto_open_plinks = nconf->auto_open_plinks;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES, mask))
+		conf->dot11MeshHWMPmaxPREQretries =
+			nconf->dot11MeshHWMPmaxPREQretries;
+	if (_chg_mesh_attr(NL80211_MESHCONF_PATH_REFRESH_TIME, mask))
+		conf->path_refresh_time = nconf->path_refresh_time;
+	if (_chg_mesh_attr(NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT, mask))
+		conf->min_discovery_timeout = nconf->min_discovery_timeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT, mask))
+		conf->dot11MeshHWMPactivePathTimeout =
+			nconf->dot11MeshHWMPactivePathTimeout;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL, mask))
+		conf->dot11MeshHWMPpreqMinInterval =
+			nconf->dot11MeshHWMPpreqMinInterval;
+	if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+			   mask))
+		conf->dot11MeshHWMPnetDiameterTraversalTime =
+			nconf->dot11MeshHWMPnetDiameterTraversalTime;
+	return 0;
+}
+
 #endif
 
 static int ieee80211_change_bss(struct wiphy *wiphy,
@@ -1007,6 +1073,8 @@ struct cfg80211_ops mac80211_config_ops = {
 	.change_mpath = ieee80211_change_mpath,
 	.get_mpath = ieee80211_get_mpath,
 	.dump_mpath = ieee80211_dump_mpath,
+	.set_mesh_params = ieee80211_set_mesh_params,
+	.get_mesh_params = ieee80211_get_mesh_params,
 #endif
 	.change_bss = ieee80211_change_bss,
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index fe4efdd4253..2c91108e390 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/etherdevice.h>
+#include <net/cfg80211.h>
 #include <net/wireless.h>
 #include <net/iw_handler.h>
 #include <net/mac80211.h>
@@ -247,26 +248,6 @@ struct mesh_preq_queue {
 	u8 flags;
 };
 
-struct mesh_config {
-	/* Timeouts in ms */
-	/* Mesh plink management parameters */
-	u16 dot11MeshRetryTimeout;
-	u16 dot11MeshConfirmTimeout;
-	u16 dot11MeshHoldingTimeout;
-	u16 dot11MeshMaxPeerLinks;
-	u8  dot11MeshMaxRetries;
-	u8  dot11MeshTTL;
-	bool auto_open_plinks;
-	/* HWMP parameters */
-	u8  dot11MeshHWMPmaxPREQretries;
-	u32 path_refresh_time;
-	u16 min_discovery_timeout;
-	u32 dot11MeshHWMPactivePathTimeout;
-	u16 dot11MeshHWMPpreqMinInterval;
-	u16 dot11MeshHWMPnetDiameterTraversalTime;
-};
-
-
 /* flags used in struct ieee80211_if_sta.flags */
 #define IEEE80211_STA_SSID_SET		BIT(0)
 #define IEEE80211_STA_BSSID_SET		BIT(1)
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2b87aec231e..9a16e9e6c5c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -96,6 +96,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
 
+	[NL80211_ATTR_MESH_PARAMS] = { .type = NLA_NESTED },
+
 	[NL80211_ATTR_HT_CAPABILITY] = { .type = NLA_BINARY,
 					 .len = NL80211_HT_CAPABILITY_LEN },
 };
@@ -1698,6 +1700,183 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
 	return r;
 }
 
+static int nl80211_get_mesh_params(struct sk_buff *skb,
+	struct genl_info *info)
+{
+	struct cfg80211_registered_device *drv;
+	struct mesh_config cur_params;
+	int err;
+	struct net_device *dev;
+	void *hdr;
+	struct nlattr *pinfoattr;
+	struct sk_buff *msg;
+
+	/* Look up our device */
+	err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
+	if (err)
+		return err;
+
+	/* Get the mesh params */
+	rtnl_lock();
+	err = drv->ops->get_mesh_params(&drv->wiphy, dev, &cur_params);
+	rtnl_unlock();
+	if (err)
+		goto out;
+
+	/* Draw up a netlink message to send back */
+	msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOBUFS;
+		goto out;
+	}
+	hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0,
+			     NL80211_CMD_GET_MESH_PARAMS);
+	if (!hdr)
+		goto nla_put_failure;
+	pinfoattr = nla_nest_start(msg, NL80211_ATTR_MESH_PARAMS);
+	if (!pinfoattr)
+		goto nla_put_failure;
+	NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, dev->ifindex);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_RETRY_TIMEOUT,
+			cur_params.dot11MeshRetryTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_CONFIRM_TIMEOUT,
+			cur_params.dot11MeshConfirmTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_HOLDING_TIMEOUT,
+			cur_params.dot11MeshHoldingTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_MAX_PEER_LINKS,
+			cur_params.dot11MeshMaxPeerLinks);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_MAX_RETRIES,
+			cur_params.dot11MeshMaxRetries);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_TTL,
+			cur_params.dot11MeshTTL);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_AUTO_OPEN_PLINKS,
+			cur_params.auto_open_plinks);
+	NLA_PUT_U8(msg, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+			cur_params.dot11MeshHWMPmaxPREQretries);
+	NLA_PUT_U32(msg, NL80211_MESHCONF_PATH_REFRESH_TIME,
+			cur_params.path_refresh_time);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+			cur_params.min_discovery_timeout);
+	NLA_PUT_U32(msg, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+			cur_params.dot11MeshHWMPactivePathTimeout);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+			cur_params.dot11MeshHWMPpreqMinInterval);
+	NLA_PUT_U16(msg, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+			cur_params.dot11MeshHWMPnetDiameterTraversalTime);
+	nla_nest_end(msg, pinfoattr);
+	genlmsg_end(msg, hdr);
+	err = genlmsg_unicast(msg, info->snd_pid);
+	goto out;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	err = -EMSGSIZE;
+out:
+	/* Cleanup */
+	cfg80211_put_dev(drv);
+	dev_put(dev);
+	return err;
+}
+
+#define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \
+do {\
+	if (table[attr_num]) {\
+		cfg.param = nla_fn(table[attr_num]); \
+		mask |= (1 << (attr_num - 1)); \
+	} \
+} while (0);\
+
+static struct nla_policy
+nl80211_meshconf_params_policy[NL80211_MESHCONF_ATTR_MAX+1] __read_mostly = {
+	[NL80211_MESHCONF_RETRY_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_CONFIRM_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HOLDING_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_MAX_PEER_LINKS] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_MAX_RETRIES] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_TTL] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_AUTO_OPEN_PLINKS] = { .type = NLA_U8 },
+
+	[NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES] = { .type = NLA_U8 },
+	[NL80211_MESHCONF_PATH_REFRESH_TIME] = { .type = NLA_U32 },
+	[NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT] = { .type = NLA_U32 },
+	[NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL] = { .type = NLA_U16 },
+	[NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME] = { .type = NLA_U16 },
+};
+
+static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
+{
+	int err;
+	u32 mask;
+	struct cfg80211_registered_device *drv;
+	struct net_device *dev;
+	struct mesh_config cfg;
+	struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1];
+	struct nlattr *parent_attr;
+
+	parent_attr = info->attrs[NL80211_ATTR_MESH_PARAMS];
+	if (!parent_attr)
+		return -EINVAL;
+	if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX,
+			parent_attr, nl80211_meshconf_params_policy))
+		return -EINVAL;
+
+	err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
+	if (err)
+		return err;
+
+	/* This makes sure that there aren't more than 32 mesh config
+	 * parameters (otherwise our bitfield scheme would not work.) */
+	BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
+
+	/* Fill in the params struct */
+	mask = 0;
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshRetryTimeout,
+			mask, NL80211_MESHCONF_RETRY_TIMEOUT, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshConfirmTimeout,
+			mask, NL80211_MESHCONF_CONFIRM_TIMEOUT, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHoldingTimeout,
+			mask, NL80211_MESHCONF_HOLDING_TIMEOUT, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxPeerLinks,
+			mask, NL80211_MESHCONF_MAX_PEER_LINKS, nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshMaxRetries,
+			mask, NL80211_MESHCONF_MAX_RETRIES, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshTTL,
+			mask, NL80211_MESHCONF_TTL, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, auto_open_plinks,
+			mask, NL80211_MESHCONF_AUTO_OPEN_PLINKS, nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPmaxPREQretries,
+			mask, NL80211_MESHCONF_HWMP_MAX_PREQ_RETRIES,
+			nla_get_u8);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, path_refresh_time,
+			mask, NL80211_MESHCONF_PATH_REFRESH_TIME, nla_get_u32);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, min_discovery_timeout,
+			mask, NL80211_MESHCONF_MIN_DISCOVERY_TIMEOUT,
+			nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPactivePathTimeout,
+			mask, NL80211_MESHCONF_HWMP_ACTIVE_PATH_TIMEOUT,
+			nla_get_u32);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshHWMPpreqMinInterval,
+			mask, NL80211_MESHCONF_HWMP_PREQ_MIN_INTERVAL,
+			nla_get_u16);
+	FILL_IN_MESH_PARAM_IF_SET(tb, cfg,
+			dot11MeshHWMPnetDiameterTraversalTime,
+			mask, NL80211_MESHCONF_HWMP_NET_DIAM_TRVS_TIME,
+			nla_get_u16);
+
+	/* Apply changes */
+	rtnl_lock();
+	err = drv->ops->set_mesh_params(&drv->wiphy, dev, &cfg, mask);
+	rtnl_unlock();
+
+	/* cleanup */
+	cfg80211_put_dev(drv);
+	dev_put(dev);
+	return err;
+}
+
+#undef FILL_IN_MESH_PARAM_IF_SET
+
 static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1];
@@ -1915,6 +2094,18 @@ static struct genl_ops nl80211_ops[] = {
 		.policy = nl80211_policy,
 		.flags = GENL_ADMIN_PERM,
 	},
+	{
+		.cmd = NL80211_CMD_GET_MESH_PARAMS,
+		.doit = nl80211_get_mesh_params,
+		.policy = nl80211_policy,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = NL80211_CMD_SET_MESH_PARAMS,
+		.doit = nl80211_set_mesh_params,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
 };
 
 /* multicast groups */
-- 
cgit v1.2.3-70-g09d2


From 9387b7caf3049168fc97a8a9111af8fe2143af18 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Tue, 30 Sep 2008 20:59:05 -0400
Subject: wireless: use individual buffers for printing ssid values

Also change escape_ssid to print_ssid to match print_mac semantics.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ipw2100.c              |  20 ++--
 drivers/net/wireless/ipw2200.c              | 153 +++++++++++++++++-----------
 drivers/net/wireless/iwlwifi/iwl-scan.c     |   7 +-
 drivers/net/wireless/iwlwifi/iwl3945-base.c |   9 +-
 drivers/net/wireless/libertas/assoc.c       |  17 ++--
 drivers/net/wireless/libertas/cmd.c         |   3 +-
 drivers/net/wireless/libertas/debugfs.c     |   4 +-
 drivers/net/wireless/libertas/scan.c        |  12 ++-
 drivers/net/wireless/libertas/wext.c        |   3 +-
 include/linux/ieee80211.h                   |   6 +-
 include/net/lib80211.h                      |   5 +-
 net/ieee80211/ieee80211_rx.c                |  19 ++--
 net/ieee80211/ieee80211_wx.c                |   7 +-
 net/wireless/lib80211.c                     |   9 +-
 14 files changed, 168 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c
index 223914e3e07..062c9f28030 100644
--- a/drivers/net/wireless/ipw2100.c
+++ b/drivers/net/wireless/ipw2100.c
@@ -163,6 +163,8 @@ that only one external action is invoked at a time.
 #include <linux/ctype.h>
 #include <linux/pm_qos_params.h>
 
+#include <net/lib80211.h>
+
 #include "ipw2100.h"
 
 #define IPW2100_VERSION "git-1.2.2"
@@ -1914,6 +1916,7 @@ static void isr_indicate_associated(struct ipw2100_priv *priv, u32 status)
 	u32 chan;
 	char *txratename;
 	u8 bssid[ETH_ALEN];
+	DECLARE_SSID_BUF(ssid);
 
 	/*
 	 * TBD: BSSID is usually 00:00:00:00:00:00 here and not
@@ -1975,7 +1978,7 @@ static void isr_indicate_associated(struct ipw2100_priv *priv, u32 status)
 	}
 
 	IPW_DEBUG_INFO("%s: Associated with '%s' at %s, channel %d (BSSID=%pM)\n",
-		       priv->net_dev->name, escape_ssid(essid, essid_len),
+		       priv->net_dev->name, print_ssid(ssid, essid, essid_len),
 		       txratename, chan, bssid);
 
 	/* now we copy read ssid into dev */
@@ -2002,8 +2005,9 @@ static int ipw2100_set_essid(struct ipw2100_priv *priv, char *essid,
 		.host_command_length = ssid_len
 	};
 	int err;
+	DECLARE_SSID_BUF(ssid);
 
-	IPW_DEBUG_HC("SSID: '%s'\n", escape_ssid(essid, ssid_len));
+	IPW_DEBUG_HC("SSID: '%s'\n", print_ssid(ssid, essid, ssid_len));
 
 	if (ssid_len)
 		memcpy(cmd.host_command_parameters, essid, ssid_len);
@@ -2044,9 +2048,11 @@ static int ipw2100_set_essid(struct ipw2100_priv *priv, char *essid,
 
 static void isr_indicate_association_lost(struct ipw2100_priv *priv, u32 status)
 {
+	DECLARE_SSID_BUF(ssid);
+
 	IPW_DEBUG(IPW_DL_NOTIF | IPW_DL_STATE | IPW_DL_ASSOC,
 		  "disassociated: '%s' %pM \n",
-		  escape_ssid(priv->essid, priv->essid_len),
+		  print_ssid(ssid, priv->essid, priv->essid_len),
 		  priv->bssid);
 
 	priv->status &= ~(STATUS_ASSOCIATED | STATUS_ASSOCIATING);
@@ -6958,6 +6964,7 @@ static int ipw2100_wx_set_essid(struct net_device *dev,
 	char *essid = "";	/* ANY */
 	int length = 0;
 	int err = 0;
+	DECLARE_SSID_BUF(ssid);
 
 	mutex_lock(&priv->action_mutex);
 	if (!(priv->status & STATUS_INITIALIZED)) {
@@ -6987,8 +6994,8 @@ static int ipw2100_wx_set_essid(struct net_device *dev,
 		goto done;
 	}
 
-	IPW_DEBUG_WX("Setting ESSID: '%s' (%d)\n", escape_ssid(essid, length),
-		     length);
+	IPW_DEBUG_WX("Setting ESSID: '%s' (%d)\n",
+		     print_ssid(ssid, essid, length), length);
 
 	priv->essid_len = length;
 	memcpy(priv->essid, essid, priv->essid_len);
@@ -7009,12 +7016,13 @@ static int ipw2100_wx_get_essid(struct net_device *dev,
 	 */
 
 	struct ipw2100_priv *priv = ieee80211_priv(dev);
+	DECLARE_SSID_BUF(ssid);
 
 	/* If we are associated, trying to associate, or have a statically
 	 * configured ESSID then return that; otherwise return ANY */
 	if (priv->config & CFG_STATIC_ESSID || priv->status & STATUS_ASSOCIATED) {
 		IPW_DEBUG_WX("Getting essid: '%s'\n",
-			     escape_ssid(priv->essid, priv->essid_len));
+			     print_ssid(ssid, priv->essid, priv->essid_len));
 		memcpy(extra, priv->essid, priv->essid_len);
 		wrqu->essid.length = priv->essid_len;
 		wrqu->essid.flags = 1;	/* active */
diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 6e0c55c64e1..2b9d96a5c10 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -4395,6 +4395,7 @@ static void handle_scan_event(struct ipw_priv *priv)
 static void ipw_rx_notification(struct ipw_priv *priv,
 				       struct ipw_rx_notification *notif)
 {
+	DECLARE_SSID_BUF(ssid);
 	u16 size = le16_to_cpu(notif->size);
 	notif->size = le16_to_cpu(notif->size);
 
@@ -4409,8 +4410,8 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 					IPW_DEBUG(IPW_DL_NOTIF | IPW_DL_STATE |
 						  IPW_DL_ASSOC,
 						  "associated: '%s' %pM \n",
-						  escape_ssid(priv->essid,
-							      priv->essid_len),
+						  print_ssid(ssid, priv->essid,
+							     priv->essid_len),
 						  priv->bssid);
 
 					switch (priv->ieee->iw_mode) {
@@ -4490,10 +4491,11 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 							  "deauthenticated: '%s' "
 							  "%pM"
 							  ": (0x%04X) - %s \n",
-							  escape_ssid(priv->
-								      essid,
-								      priv->
-								      essid_len),
+							  print_ssid(ssid,
+								     priv->
+								     essid,
+								     priv->
+								     essid_len),
 							  priv->bssid,
 							  le16_to_cpu(auth->status),
 							  ipw_get_status_code
@@ -4512,8 +4514,8 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 					IPW_DEBUG(IPW_DL_NOTIF | IPW_DL_STATE |
 						  IPW_DL_ASSOC,
 						  "authenticated: '%s' %pM\n",
-						  escape_ssid(priv->essid,
-							       priv->essid_len),
+						  print_ssid(ssid, priv->essid,
+							     priv->essid_len),
 						  priv->bssid);
 					break;
 				}
@@ -4540,8 +4542,8 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 					IPW_DEBUG(IPW_DL_NOTIF | IPW_DL_STATE |
 						  IPW_DL_ASSOC,
 						  "disassociated: '%s' %pM \n",
-						  escape_ssid(priv->essid,
-							       priv->essid_len),
+						  print_ssid(ssid, priv->essid,
+							     priv->essid_len),
 						  priv->bssid);
 
 					priv->status &=
@@ -4578,8 +4580,8 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 			case CMAS_AUTHENTICATED:
 				IPW_DEBUG(IPW_DL_NOTIF | IPW_DL_STATE,
 					  "authenticated: '%s' %pM \n",
-					  escape_ssid(priv->essid,
-						       priv->essid_len),
+					  print_ssid(ssid, priv->essid,
+						     priv->essid_len),
 					  priv->bssid);
 				priv->status |= STATUS_AUTH;
 				break;
@@ -4597,8 +4599,8 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 				IPW_DEBUG(IPW_DL_NOTIF | IPW_DL_STATE |
 					  IPW_DL_ASSOC,
 					  "deauthenticated: '%s' %pM\n",
-					  escape_ssid(priv->essid,
-						      priv->essid_len),
+					  print_ssid(ssid, priv->essid,
+						     priv->essid_len),
 					  priv->bssid);
 
 				priv->status &= ~(STATUS_ASSOCIATING |
@@ -5423,6 +5425,7 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 				  int roaming)
 {
 	struct ipw_supported_rates rates;
+	DECLARE_SSID_BUF(ssid);
 
 	/* Verify that this network's capability is compatible with the
 	 * current mode (AdHoc or Infrastructure) */
@@ -5430,7 +5433,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	     !(network->capability & WLAN_CAPABILITY_IBSS))) {
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded due to "
 				"capability mismatch.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5443,8 +5447,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 			   network->ssid_len)) {
 			IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 					"because of non-network ESSID.\n",
-					escape_ssid(network->ssid,
-						    network->ssid_len),
+					print_ssid(ssid, network->ssid,
+						   network->ssid_len),
 					network->bssid);
 			return 0;
 		}
@@ -5458,13 +5462,14 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 			char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
 
 			strncpy(escaped,
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				sizeof(escaped));
 			IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 					"because of ESSID mismatch: '%s'.\n",
 					escaped, network->bssid,
-					escape_ssid(priv->essid,
-						    priv->essid_len));
+					print_ssid(ssid, priv->essid,
+						   priv->essid_len));
 			return 0;
 		}
 	}
@@ -5475,14 +5480,14 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	if (network->time_stamp[0] < match->network->time_stamp[0]) {
 		IPW_DEBUG_MERGE("Network '%s excluded because newer than "
 				"current network.\n",
-				escape_ssid(match->network->ssid,
-					    match->network->ssid_len));
+				print_ssid(ssid, match->network->ssid,
+					   match->network->ssid_len));
 		return 0;
 	} else if (network->time_stamp[1] < match->network->time_stamp[1]) {
 		IPW_DEBUG_MERGE("Network '%s excluded because newer than "
 				"current network.\n",
-				escape_ssid(match->network->ssid,
-					    match->network->ssid_len));
+				print_ssid(ssid, match->network->ssid,
+					   match->network->ssid_len));
 		return 0;
 	}
 
@@ -5491,7 +5496,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	    time_after(jiffies, network->last_scanned + priv->ieee->scan_age)) {
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because of age: %ums.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				jiffies_to_msecs(jiffies -
 						 network->last_scanned));
@@ -5502,7 +5508,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	    (network->channel != priv->channel)) {
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because of channel mismatch: %d != %d.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				network->channel, priv->channel);
 		return 0;
@@ -5513,7 +5520,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	    ((network->capability & WLAN_CAPABILITY_PRIVACY) ? 1 : 0)) {
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because of privacy mismatch: %s != %s.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				priv->
 				capability & CAP_PRIVACY_ON ? "on" : "off",
@@ -5526,8 +5534,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	if (!memcmp(network->bssid, priv->bssid, ETH_ALEN)) {
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because of the same BSSID match: %pM"
-				".\n", escape_ssid(network->ssid,
-						   network->ssid_len),
+				".\n", print_ssid(ssid, network->ssid,
+						  network->ssid_len),
 				network->bssid,
 				priv->bssid);
 		return 0;
@@ -5538,7 +5546,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because of invalid frequency/mode "
 				"combination.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5549,7 +5558,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because configured rate mask excludes "
 				"AP mandatory rate.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5557,7 +5567,8 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	if (rates.num_rates == 0) {
 		IPW_DEBUG_MERGE("Network '%s (%pM)' excluded "
 				"because of no compatible rates.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5570,7 +5581,7 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 	ipw_copy_rates(&match->rates, &rates);
 	match->network = network;
 	IPW_DEBUG_MERGE("Network '%s (%pM)' is a viable match.\n",
-			escape_ssid(network->ssid, network->ssid_len),
+			print_ssid(ssid, network->ssid, network->ssid_len),
 			network->bssid);
 
 	return 1;
@@ -5578,6 +5589,7 @@ static int ipw_find_adhoc_network(struct ipw_priv *priv,
 
 static void ipw_merge_adhoc_network(struct work_struct *work)
 {
+	DECLARE_SSID_BUF(ssid);
 	struct ipw_priv *priv =
 		container_of(work, struct ipw_priv, merge_networks);
 	struct ieee80211_network *network = NULL;
@@ -5608,8 +5620,8 @@ static void ipw_merge_adhoc_network(struct work_struct *work)
 		mutex_lock(&priv->mutex);
 		if ((priv->ieee->iw_mode == IW_MODE_ADHOC)) {
 			IPW_DEBUG_MERGE("remove network %s\n",
-					escape_ssid(priv->essid,
-						    priv->essid_len));
+					print_ssid(ssid, priv->essid,
+						   priv->essid_len));
 			ipw_remove_current_network(priv);
 		}
 
@@ -5625,6 +5637,7 @@ static int ipw_best_network(struct ipw_priv *priv,
 			    struct ieee80211_network *network, int roaming)
 {
 	struct ipw_supported_rates rates;
+	DECLARE_SSID_BUF(ssid);
 
 	/* Verify that this network's capability is compatible with the
 	 * current mode (AdHoc or Infrastructure) */
@@ -5634,7 +5647,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	     !(network->capability & WLAN_CAPABILITY_IBSS))) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded due to "
 				"capability mismatch.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5647,8 +5661,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 			   network->ssid_len)) {
 			IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 					"because of non-network ESSID.\n",
-					escape_ssid(network->ssid,
-						     network->ssid_len),
+					print_ssid(ssid, network->ssid,
+						   network->ssid_len),
 					network->bssid);
 			return 0;
 		}
@@ -5661,13 +5675,14 @@ static int ipw_best_network(struct ipw_priv *priv,
 			    min(network->ssid_len, priv->essid_len)))) {
 			char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
 			strncpy(escaped,
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				sizeof(escaped));
 			IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 					"because of ESSID mismatch: '%s'.\n",
 					escaped, network->bssid,
-					escape_ssid(priv->essid,
-						    priv->essid_len));
+					print_ssid(ssid, priv->essid,
+						   priv->essid_len));
 			return 0;
 		}
 	}
@@ -5677,13 +5692,13 @@ static int ipw_best_network(struct ipw_priv *priv,
 	if (match->network && match->network->stats.rssi > network->stats.rssi) {
 		char escaped[IW_ESSID_MAX_SIZE * 2 + 1];
 		strncpy(escaped,
-			escape_ssid(network->ssid, network->ssid_len),
+			print_ssid(ssid, network->ssid, network->ssid_len),
 			sizeof(escaped));
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded because "
 				"'%s (%pM)' has a stronger signal.\n",
 				escaped, network->bssid,
-				escape_ssid(match->network->ssid,
-					    match->network->ssid_len),
+				print_ssid(ssid, match->network->ssid,
+					   match->network->ssid_len),
 				match->network->bssid);
 		return 0;
 	}
@@ -5695,7 +5710,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of storming (%ums since last "
 				"assoc attempt).\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				jiffies_to_msecs(jiffies -
 						 network->last_associate));
@@ -5707,7 +5723,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	    time_after(jiffies, network->last_scanned + priv->ieee->scan_age)) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of age: %ums.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				jiffies_to_msecs(jiffies -
 						 network->last_scanned));
@@ -5718,7 +5735,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	    (network->channel != priv->channel)) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of channel mismatch: %d != %d.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				network->channel, priv->channel);
 		return 0;
@@ -5729,7 +5747,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	    ((network->capability & WLAN_CAPABILITY_PRIVACY) ? 1 : 0)) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of privacy mismatch: %s != %s.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid,
 				priv->capability & CAP_PRIVACY_ON ? "on" :
 				"off",
@@ -5742,7 +5761,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	    memcmp(network->bssid, priv->bssid, ETH_ALEN)) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of BSSID mismatch: %pM.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid, priv->bssid);
 		return 0;
 	}
@@ -5752,7 +5772,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of invalid frequency/mode "
 				"combination.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5761,7 +5782,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	if (!ieee80211_is_valid_channel(priv->ieee, network->channel)) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of invalid channel in current GEO\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5772,7 +5794,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because configured rate mask excludes "
 				"AP mandatory rate.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5780,7 +5803,8 @@ static int ipw_best_network(struct ipw_priv *priv,
 	if (rates.num_rates == 0) {
 		IPW_DEBUG_ASSOC("Network '%s (%pM)' excluded "
 				"because of no compatible rates.\n",
-				escape_ssid(network->ssid, network->ssid_len),
+				print_ssid(ssid, network->ssid,
+					   network->ssid_len),
 				network->bssid);
 		return 0;
 	}
@@ -5794,7 +5818,7 @@ static int ipw_best_network(struct ipw_priv *priv,
 	match->network = network;
 
 	IPW_DEBUG_ASSOC("Network '%s (%pM)' is a viable match.\n",
-			escape_ssid(network->ssid, network->ssid_len),
+			print_ssid(ssid, network->ssid, network->ssid_len),
 			network->bssid);
 
 	return 1;
@@ -6037,6 +6061,7 @@ static void ipw_bg_adhoc_check(struct work_struct *work)
 
 static void ipw_debug_config(struct ipw_priv *priv)
 {
+	DECLARE_SSID_BUF(ssid);
 	IPW_DEBUG_INFO("Scan completed, no valid APs matched "
 		       "[CFG 0x%08X]\n", priv->config);
 	if (priv->config & CFG_STATIC_CHANNEL)
@@ -6045,7 +6070,7 @@ static void ipw_debug_config(struct ipw_priv *priv)
 		IPW_DEBUG_INFO("Channel unlocked.\n");
 	if (priv->config & CFG_STATIC_ESSID)
 		IPW_DEBUG_INFO("ESSID locked to '%s'\n",
-			       escape_ssid(priv->essid, priv->essid_len));
+			       print_ssid(ssid, priv->essid, priv->essid_len));
 	else
 		IPW_DEBUG_INFO("ESSID unlocked.\n");
 	if (priv->config & CFG_STATIC_BSSID)
@@ -7263,6 +7288,7 @@ static int ipw_associate_network(struct ipw_priv *priv,
 				 struct ipw_supported_rates *rates, int roaming)
 {
 	int err;
+	DECLARE_SSID_BUF(ssid);
 
 	if (priv->config & CFG_FIXED_RATE)
 		ipw_set_fixed_rate(priv, network->mode);
@@ -7331,7 +7357,7 @@ static int ipw_associate_network(struct ipw_priv *priv,
 	IPW_DEBUG_ASSOC("%sssocation attempt: '%s', channel %d, "
 			"802.11%c [%d], %s[:%s], enc=%s%s%s%c%c\n",
 			roaming ? "Rea" : "A",
-			escape_ssid(priv->essid, priv->essid_len),
+			print_ssid(ssid, priv->essid, priv->essid_len),
 			network->channel,
 			ipw_modes[priv->assoc_request.ieee_mode],
 			rates->num_rates,
@@ -7431,7 +7457,7 @@ static int ipw_associate_network(struct ipw_priv *priv,
 	}
 
 	IPW_DEBUG(IPW_DL_STATE, "associating: '%s' %pM \n",
-		  escape_ssid(priv->essid, priv->essid_len),
+		  print_ssid(ssid, priv->essid, priv->essid_len),
 		  priv->bssid);
 
 	return 0;
@@ -7522,6 +7548,7 @@ static int ipw_associate(void *data)
 	struct ipw_supported_rates *rates;
 	struct list_head *element;
 	unsigned long flags;
+	DECLARE_SSID_BUF(ssid);
 
 	if (priv->ieee->iw_mode == IW_MODE_MONITOR) {
 		IPW_DEBUG_ASSOC("Not attempting association (monitor mode)\n");
@@ -7583,8 +7610,8 @@ static int ipw_associate(void *data)
 			target = oldest;
 			IPW_DEBUG_ASSOC("Expired '%s' (%pM) from "
 					"network list.\n",
-					escape_ssid(target->ssid,
-						    target->ssid_len),
+					print_ssid(ssid, target->ssid,
+						   target->ssid_len),
 					target->bssid);
 			list_add_tail(&target->list,
 				      &priv->ieee->network_free_list);
@@ -9012,6 +9039,7 @@ static int ipw_wx_set_essid(struct net_device *dev,
 {
 	struct ipw_priv *priv = ieee80211_priv(dev);
         int length;
+	DECLARE_SSID_BUF(ssid);
 
         mutex_lock(&priv->mutex);
 
@@ -9036,8 +9064,8 @@ static int ipw_wx_set_essid(struct net_device *dev,
 		return 0;
 	}
 
-	IPW_DEBUG_WX("Setting ESSID: '%s' (%d)\n", escape_ssid(extra, length),
-		     length);
+	IPW_DEBUG_WX("Setting ESSID: '%s' (%d)\n",
+		     print_ssid(ssid, extra, length), length);
 
 	priv->essid_len = length;
 	memcpy(priv->essid, extra, priv->essid_len);
@@ -9056,6 +9084,7 @@ static int ipw_wx_get_essid(struct net_device *dev,
 			    union iwreq_data *wrqu, char *extra)
 {
 	struct ipw_priv *priv = ieee80211_priv(dev);
+	DECLARE_SSID_BUF(ssid);
 
 	/* If we are associated, trying to associate, or have a statically
 	 * configured ESSID then return that; otherwise return ANY */
@@ -9063,7 +9092,7 @@ static int ipw_wx_get_essid(struct net_device *dev,
 	if (priv->config & CFG_STATIC_ESSID ||
 	    priv->status & (STATUS_ASSOCIATED | STATUS_ASSOCIATING)) {
 		IPW_DEBUG_WX("Getting essid: '%s'\n",
-			     escape_ssid(priv->essid, priv->essid_len));
+			     print_ssid(ssid, priv->essid, priv->essid_len));
 		memcpy(extra, priv->essid, priv->essid_len);
 		wrqu->essid.length = priv->essid_len;
 		wrqu->essid.flags = 1;	/* active */
diff --git a/drivers/net/wireless/iwlwifi/iwl-scan.c b/drivers/net/wireless/iwlwifi/iwl-scan.c
index 1cc8aa59282..3379b41fb5e 100644
--- a/drivers/net/wireless/iwlwifi/iwl-scan.c
+++ b/drivers/net/wireless/iwlwifi/iwl-scan.c
@@ -643,6 +643,7 @@ static void iwl_bg_request_scan(struct work_struct *data)
 	u8 n_probes = 2;
 	u8 rx_chain = priv->hw_params.valid_rx_ant;
 	u8 rate;
+	DECLARE_SSID_BUF(ssid);
 
 	conf = ieee80211_get_hw_conf(priv->hw);
 
@@ -735,8 +736,8 @@ static void iwl_bg_request_scan(struct work_struct *data)
 	/* We should add the ability for user to lock to PASSIVE ONLY */
 	if (priv->one_direct_scan) {
 		IWL_DEBUG_SCAN("Start direct scan for '%s'\n",
-				escape_ssid(priv->direct_ssid,
-					    priv->direct_ssid_len));
+				print_ssid(ssid, priv->direct_ssid,
+					   priv->direct_ssid_len));
 		scan->direct_scan[0].id = WLAN_EID_SSID;
 		scan->direct_scan[0].len = priv->direct_ssid_len;
 		memcpy(scan->direct_scan[0].ssid,
@@ -744,7 +745,7 @@ static void iwl_bg_request_scan(struct work_struct *data)
 		n_probes++;
 	} else if (!iwl_is_associated(priv) && priv->essid_len) {
 		IWL_DEBUG_SCAN("Start direct scan for '%s' (not associated)\n",
-				escape_ssid(priv->essid, priv->essid_len));
+				print_ssid(ssid, priv->essid, priv->essid_len));
 		scan->direct_scan[0].id = WLAN_EID_SSID;
 		scan->direct_scan[0].len = priv->essid_len;
 		memcpy(scan->direct_scan[0].ssid, priv->essid, priv->essid_len);
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index 370cc46b488..8009094503e 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -6054,6 +6054,7 @@ static void iwl3945_bg_request_scan(struct work_struct *data)
 	struct ieee80211_conf *conf = NULL;
 	u8 n_probes = 2;
 	enum ieee80211_band band;
+	DECLARE_SSID_BUF(ssid);
 
 	conf = ieee80211_get_hw_conf(priv->hw);
 
@@ -6154,7 +6155,8 @@ static void iwl3945_bg_request_scan(struct work_struct *data)
 	if (priv->one_direct_scan) {
 		IWL_DEBUG_SCAN
 		    ("Kicking off one direct scan for '%s'\n",
-		     escape_ssid(priv->direct_ssid, priv->direct_ssid_len));
+		     print_ssid(ssid, priv->direct_ssid,
+				priv->direct_ssid_len));
 		scan->direct_scan[0].id = WLAN_EID_SSID;
 		scan->direct_scan[0].len = priv->direct_ssid_len;
 		memcpy(scan->direct_scan[0].ssid,
@@ -6163,7 +6165,7 @@ static void iwl3945_bg_request_scan(struct work_struct *data)
 	} else if (!iwl3945_is_associated(priv) && priv->essid_len) {
 		IWL_DEBUG_SCAN
 		  ("Kicking off one direct scan for '%s' when not associated\n",
-		   escape_ssid(priv->essid, priv->essid_len));
+		   print_ssid(ssid, priv->essid, priv->essid_len));
 		scan->direct_scan[0].id = WLAN_EID_SSID;
 		scan->direct_scan[0].len = priv->essid_len;
 		memcpy(scan->direct_scan[0].ssid, priv->essid, priv->essid_len);
@@ -6945,6 +6947,7 @@ static int iwl3945_mac_hw_scan(struct ieee80211_hw *hw, u8 *ssid, size_t len)
 	int rc = 0;
 	unsigned long flags;
 	struct iwl3945_priv *priv = hw->priv;
+	DECLARE_SSID_BUF(ssid_buf);
 
 	IWL_DEBUG_MAC80211("enter\n");
 
@@ -6978,7 +6981,7 @@ static int iwl3945_mac_hw_scan(struct ieee80211_hw *hw, u8 *ssid, size_t len)
 	}
 	if (len) {
 		IWL_DEBUG_SCAN("direct scan for %s [%d]\n ",
-			       escape_ssid(ssid, len), (int)len);
+			       print_ssid(ssid_buf, ssid, len), (int)len);
 
 		priv->one_direct_scan = 1;
 		priv->direct_ssid_len = (u8)
diff --git a/drivers/net/wireless/libertas/assoc.c b/drivers/net/wireless/libertas/assoc.c
index 3492e89d1dd..92863780286 100644
--- a/drivers/net/wireless/libertas/assoc.c
+++ b/drivers/net/wireless/libertas/assoc.c
@@ -153,17 +153,18 @@ static int lbs_adhoc_join(struct lbs_private *priv,
 	struct cmd_ds_802_11_ad_hoc_join cmd;
 	struct bss_descriptor *bss = &assoc_req->bss;
 	u8 preamble = RADIO_PREAMBLE_LONG;
+	DECLARE_SSID_BUF(ssid);
 	u16 ratesize = 0;
 	int ret = 0;
 
 	lbs_deb_enter(LBS_DEB_ASSOC);
 
 	lbs_deb_join("current SSID '%s', ssid length %u\n",
-		escape_ssid(priv->curbssparams.ssid,
+		print_ssid(ssid, priv->curbssparams.ssid,
 		priv->curbssparams.ssid_len),
 		priv->curbssparams.ssid_len);
 	lbs_deb_join("requested ssid '%s', ssid length %u\n",
-		escape_ssid(bss->ssid, bss->ssid_len),
+		print_ssid(ssid, bss->ssid, bss->ssid_len),
 		bss->ssid_len);
 
 	/* check if the requested SSID is already joined */
@@ -308,6 +309,7 @@ static int lbs_adhoc_start(struct lbs_private *priv,
 	size_t ratesize = 0;
 	u16 tmpcap = 0;
 	int ret = 0;
+	DECLARE_SSID_BUF(ssid);
 
 	lbs_deb_enter(LBS_DEB_ASSOC);
 
@@ -327,7 +329,7 @@ static int lbs_adhoc_start(struct lbs_private *priv,
 	memcpy(cmd.ssid, assoc_req->ssid, assoc_req->ssid_len);
 
 	lbs_deb_join("ADHOC_START: SSID '%s', ssid length %u\n",
-		escape_ssid(assoc_req->ssid, assoc_req->ssid_len),
+		print_ssid(ssid, assoc_req->ssid, assoc_req->ssid_len),
 		assoc_req->ssid_len);
 
 	cmd.bsstype = CMD_BSS_TYPE_IBSS;
@@ -695,6 +697,7 @@ static int assoc_helper_essid(struct lbs_private *priv,
 	int ret = 0;
 	struct bss_descriptor * bss;
 	int channel = -1;
+	DECLARE_SSID_BUF(ssid);
 
 	lbs_deb_enter(LBS_DEB_ASSOC);
 
@@ -706,7 +709,7 @@ static int assoc_helper_essid(struct lbs_private *priv,
 		channel = assoc_req->channel;
 
 	lbs_deb_assoc("SSID '%s' requested\n",
-	              escape_ssid(assoc_req->ssid, assoc_req->ssid_len));
+	              print_ssid(ssid, assoc_req->ssid, assoc_req->ssid_len));
 	if (assoc_req->mode == IW_MODE_INFRA) {
 		lbs_send_specific_ssid_scan(priv, assoc_req->ssid,
 			assoc_req->ssid_len);
@@ -1207,6 +1210,7 @@ void lbs_association_worker(struct work_struct *work)
 	struct assoc_request * assoc_req = NULL;
 	int ret = 0;
 	int find_any_ssid = 0;
+	DECLARE_SSID_BUF(ssid);
 
 	lbs_deb_enter(LBS_DEB_ASSOC);
 
@@ -1230,7 +1234,7 @@ void lbs_association_worker(struct work_struct *work)
 		"    secinfo:  %s%s%s\n"
 		"    auth_mode: %d\n",
 		assoc_req->flags,
-		escape_ssid(assoc_req->ssid, assoc_req->ssid_len),
+		print_ssid(ssid, assoc_req->ssid, assoc_req->ssid_len),
 		assoc_req->channel, assoc_req->band, assoc_req->mode,
 		assoc_req->bssid,
 		assoc_req->secinfo.WPAenabled ? " WPA" : "",
@@ -1767,6 +1771,7 @@ static int lbs_adhoc_post(struct lbs_private *priv, struct cmd_header *resp)
 	struct cmd_ds_802_11_ad_hoc_result *adhoc_resp;
 	union iwreq_data wrqu;
 	struct bss_descriptor *bss;
+	DECLARE_SSID_BUF(ssid);
 
 	lbs_deb_enter(LBS_DEB_JOIN);
 
@@ -1816,7 +1821,7 @@ static int lbs_adhoc_post(struct lbs_private *priv, struct cmd_header *resp)
 	wireless_send_event(priv->dev, SIOCGIWAP, &wrqu, NULL);
 
 	lbs_deb_join("ADHOC_RESP: Joined/started '%s', BSSID %pM, channel %d\n",
-		     escape_ssid(bss->ssid, bss->ssid_len),
+		     print_ssid(ssid, bss->ssid, bss->ssid_len),
 		     priv->curbssparams.bssid,
 		     priv->curbssparams.channel);
 
diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c
index 52feab69ee4..38843c8b919 100644
--- a/drivers/net/wireless/libertas/cmd.c
+++ b/drivers/net/wireless/libertas/cmd.c
@@ -1063,6 +1063,7 @@ int lbs_mesh_config(struct lbs_private *priv, uint16_t action, uint16_t chan)
 {
 	struct cmd_ds_mesh_config cmd;
 	struct mrvl_meshie *ie;
+	DECLARE_SSID_BUF(ssid);
 
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.channel = cpu_to_le16(chan);
@@ -1093,7 +1094,7 @@ int lbs_mesh_config(struct lbs_private *priv, uint16_t action, uint16_t chan)
 	}
 	lbs_deb_cmd("mesh config action %d type %x channel %d SSID %s\n",
 		    action, priv->mesh_tlv, chan,
-		    escape_ssid(priv->mesh_ssid, priv->mesh_ssid_len));
+		    print_ssid(ssid, priv->mesh_ssid, priv->mesh_ssid_len));
 
 	return __lbs_mesh_config_send(priv, &cmd, action, priv->mesh_tlv);
 }
diff --git a/drivers/net/wireless/libertas/debugfs.c b/drivers/net/wireless/libertas/debugfs.c
index 84933203be7..ec4efd7ff3c 100644
--- a/drivers/net/wireless/libertas/debugfs.c
+++ b/drivers/net/wireless/libertas/debugfs.c
@@ -66,6 +66,7 @@ static ssize_t lbs_getscantable(struct file *file, char __user *userbuf,
 	int numscansdone = 0, res;
 	unsigned long addr = get_zeroed_page(GFP_KERNEL);
 	char *buf = (char *)addr;
+	DECLARE_SSID_BUF(ssid);
 	struct bss_descriptor * iter_bss;
 
 	pos += snprintf(buf+pos, len-pos,
@@ -86,7 +87,8 @@ static ssize_t lbs_getscantable(struct file *file, char __user *userbuf,
 				spectrum_mgmt ? 'S' : ' ');
 		pos += snprintf(buf+pos, len-pos, " %04d |", SCAN_RSSI(iter_bss->rssi));
 		pos += snprintf(buf+pos, len-pos, " %s\n",
-		                escape_ssid(iter_bss->ssid, iter_bss->ssid_len));
+		                print_ssid(ssid, iter_bss->ssid,
+					   iter_bss->ssid_len));
 
 		numscansdone++;
 	}
diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c
index 7881890a4e9..5c34ac58818 100644
--- a/drivers/net/wireless/libertas/scan.c
+++ b/drivers/net/wireless/libertas/scan.c
@@ -362,6 +362,7 @@ int lbs_scan_networks(struct lbs_private *priv, int full_scan)
 #ifdef CONFIG_LIBERTAS_DEBUG
 	struct bss_descriptor *iter;
 	int i = 0;
+	DECLARE_SSID_BUF(ssid);
 #endif
 
 	lbs_deb_enter_args(LBS_DEB_SCAN, "full_scan %d", full_scan);
@@ -455,7 +456,7 @@ int lbs_scan_networks(struct lbs_private *priv, int full_scan)
 	list_for_each_entry(iter, &priv->network_list, list)
 		lbs_deb_scan("%02d: BSSID %pM, RSSI %d, SSID '%s'\n",
 			     i++, iter->bssid, iter->rssi,
-			     escape_ssid(iter->ssid, iter->ssid_len));
+			     print_ssid(ssid, iter->ssid, iter->ssid_len));
 	mutex_unlock(&priv->lock);
 #endif
 
@@ -514,6 +515,7 @@ static int lbs_process_bss(struct bss_descriptor *bss,
 	struct ieeetypes_dsparamset *pDS;
 	struct ieeetypes_cfparamset *pCF;
 	struct ieeetypes_ibssparamset *pibss;
+	DECLARE_SSID_BUF(ssid);
 	struct ieeetypes_countryinfoset *pcountryinfo;
 	uint8_t *pos, *end, *p;
 	uint8_t n_ex_rates = 0, got_basic_rates = 0, n_basic_rates = 0;
@@ -602,7 +604,7 @@ static int lbs_process_bss(struct bss_descriptor *bss,
 			bss->ssid_len = min_t(int, 32, elem->len);
 			memcpy(bss->ssid, elem->data, bss->ssid_len);
 			lbs_deb_scan("got SSID IE: '%s', len %u\n",
-			             escape_ssid(bss->ssid, bss->ssid_len),
+			             print_ssid(ssid, bss->ssid, bss->ssid_len),
 			             bss->ssid_len);
 			break;
 
@@ -742,10 +744,11 @@ done:
 int lbs_send_specific_ssid_scan(struct lbs_private *priv, uint8_t *ssid,
 				uint8_t ssid_len)
 {
+	DECLARE_SSID_BUF(ssid_buf);
 	int ret = 0;
 
 	lbs_deb_enter_args(LBS_DEB_SCAN, "SSID '%s'\n",
-			   escape_ssid(ssid, ssid_len));
+			   print_ssid(ssid_buf, ssid, ssid_len));
 
 	if (!ssid_len)
 		goto out;
@@ -940,6 +943,7 @@ out:
 int lbs_set_scan(struct net_device *dev, struct iw_request_info *info,
 		 union iwreq_data *wrqu, char *extra)
 {
+	DECLARE_SSID_BUF(ssid);
 	struct lbs_private *priv = dev->priv;
 	int ret = 0;
 
@@ -969,7 +973,7 @@ int lbs_set_scan(struct net_device *dev, struct iw_request_info *info,
 		priv->scan_ssid_len = req->essid_len;
 		memcpy(priv->scan_ssid, req->essid, priv->scan_ssid_len);
 		lbs_deb_wext("set_scan, essid '%s'\n",
-			escape_ssid(priv->scan_ssid, priv->scan_ssid_len));
+			print_ssid(ssid, priv->scan_ssid, priv->scan_ssid_len));
 	} else {
 		priv->scan_ssid_len = 0;
 	}
diff --git a/drivers/net/wireless/libertas/wext.c b/drivers/net/wireless/libertas/wext.c
index 24757995185..d4c6a659b56 100644
--- a/drivers/net/wireless/libertas/wext.c
+++ b/drivers/net/wireless/libertas/wext.c
@@ -1978,6 +1978,7 @@ static int lbs_set_essid(struct net_device *dev, struct iw_request_info *info,
 	u8 ssid_len = 0;
 	struct assoc_request * assoc_req;
 	int in_ssid_len = dwrq->length;
+	DECLARE_SSID_BUF(ssid_buf);
 
 	lbs_deb_enter(LBS_DEB_WEXT);
 
@@ -2006,7 +2007,7 @@ static int lbs_set_essid(struct net_device *dev, struct iw_request_info *info,
 		lbs_deb_wext("requested any SSID\n");
 	} else {
 		lbs_deb_wext("requested SSID '%s'\n",
-		             escape_ssid(ssid, ssid_len));
+		             print_ssid(ssid_buf, ssid, ssid_len));
 	}
 
 out:
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 64a4abce6d9..b0726e2079b 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -12,8 +12,8 @@
  * published by the Free Software Foundation.
  */
 
-#ifndef IEEE80211_H
-#define IEEE80211_H
+#ifndef LINUX_IEEE80211_H
+#define LINUX_IEEE80211_H
 
 #include <linux/types.h>
 #include <asm/byteorder.h>
@@ -1114,4 +1114,4 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr)
 		return hdr->addr1;
 }
 
-#endif /* IEEE80211_H */
+#endif /* LINUX_IEEE80211_H */
diff --git a/include/net/lib80211.h b/include/net/lib80211.h
index ce49a30033b..906d96f1b26 100644
--- a/include/net/lib80211.h
+++ b/include/net/lib80211.h
@@ -8,8 +8,9 @@
 #ifndef LIB80211_H
 #define LIB80211_H
 
-/* escape_ssid() is intended to be used in debug (and possibly error)
+/* print_ssid() is intended to be used in debug (and possibly error)
  * messages. It should never be used for passing ssid to user space. */
-const char *escape_ssid(const char *ssid, u8 ssid_len);
+const char *print_ssid(char *buf, const char *ssid, u8 ssid_len);
+#define DECLARE_SSID_BUF(var) char var[32 * 4 + 1] __maybe_unused
 
 #endif /* LIB80211_H */
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index f15f82e7bbf..d19c8de6ef2 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -1124,6 +1124,7 @@ static int ieee80211_parse_info_param(struct ieee80211_info_element
 				      *info_element, u16 length,
 				      struct ieee80211_network *network)
 {
+	DECLARE_SSID_BUF(ssid);
 	u8 i;
 #ifdef CONFIG_IEEE80211_DEBUG
 	char rates_str[64];
@@ -1155,7 +1156,8 @@ static int ieee80211_parse_info_param(struct ieee80211_info_element
 				       IW_ESSID_MAX_SIZE - network->ssid_len);
 
 			IEEE80211_DEBUG_MGMT("MFIE_TYPE_SSID: '%s' len=%d.\n",
-					     escape_ssid(network->ssid),
+					     print_ssid(ssid, network->ssid,
+							network->ssid_len),
 					     network->ssid_len);
 			break;
 
@@ -1401,6 +1403,8 @@ static int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee8021
 					 struct ieee80211_network *network,
 					 struct ieee80211_rx_stats *stats)
 {
+	DECLARE_SSID_BUF(ssid);
+
 	network->qos_data.active = 0;
 	network->qos_data.supported = 0;
 	network->qos_data.param_count = 0;
@@ -1449,7 +1453,7 @@ static int ieee80211_network_init(struct ieee80211_device *ieee, struct ieee8021
 	if (network->mode == 0) {
 		IEEE80211_DEBUG_SCAN("Filtered out '%s (%pM)' "
 				     "network.\n",
-				     escape_ssid(network->ssid,
+				     print_ssid(ssid, network->ssid,
 						 network->ssid_len),
 				     network->bssid);
 		return 1;
@@ -1563,10 +1567,11 @@ static void ieee80211_process_probe_response(struct ieee80211_device
 	struct ieee80211_info_element *info_element = beacon->info_element;
 #endif
 	unsigned long flags;
+	DECLARE_SSID_BUF(ssid);
 
 	IEEE80211_DEBUG_SCAN("'%s' (%pM"
 		     "): %c%c%c%c %c%c%c%c-%c%c%c%c %c%c%c%c\n",
-		     escape_ssid(info_element->data, info_element->len),
+		     print_ssid(ssid, info_element->data, info_element->len),
 		     beacon->header.addr3,
 		     (beacon->capability & cpu_to_le16(1 << 0xf)) ? '1' : '0',
 		     (beacon->capability & cpu_to_le16(1 << 0xe)) ? '1' : '0',
@@ -1587,7 +1592,7 @@ static void ieee80211_process_probe_response(struct ieee80211_device
 
 	if (ieee80211_network_init(ieee, beacon, &network, stats)) {
 		IEEE80211_DEBUG_SCAN("Dropped '%s' (%pM) via %s.\n",
-				     escape_ssid(info_element->data,
+				     print_ssid(ssid, info_element->data,
 						 info_element->len),
 				     beacon->header.addr3,
 				     is_beacon(beacon->header.frame_ctl) ?
@@ -1625,7 +1630,7 @@ static void ieee80211_process_probe_response(struct ieee80211_device
 			target = oldest;
 			IEEE80211_DEBUG_SCAN("Expired '%s' (%pM) from "
 					     "network list.\n",
-					     escape_ssid(target->ssid,
+					     print_ssid(ssid, target->ssid,
 							 target->ssid_len),
 					     target->bssid);
 			ieee80211_network_reset(target);
@@ -1638,7 +1643,7 @@ static void ieee80211_process_probe_response(struct ieee80211_device
 
 #ifdef CONFIG_IEEE80211_DEBUG
 		IEEE80211_DEBUG_SCAN("Adding '%s' (%pM) via %s.\n",
-				     escape_ssid(network.ssid,
+				     print_ssid(ssid, network.ssid,
 						 network.ssid_len),
 				     network.bssid,
 				     is_beacon(beacon->header.frame_ctl) ?
@@ -1649,7 +1654,7 @@ static void ieee80211_process_probe_response(struct ieee80211_device
 		list_add_tail(&target->list, &ieee->network_list);
 	} else {
 		IEEE80211_DEBUG_SCAN("Updating '%s' (%pM) via %s.\n",
-				     escape_ssid(target->ssid,
+				     print_ssid(ssid, target->ssid,
 						 target->ssid_len),
 				     target->bssid,
 				     is_beacon(beacon->header.frame_ctl) ?
diff --git a/net/ieee80211/ieee80211_wx.c b/net/ieee80211/ieee80211_wx.c
index 3025140ae72..29eb41695a8 100644
--- a/net/ieee80211/ieee80211_wx.c
+++ b/net/ieee80211/ieee80211_wx.c
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/jiffies.h>
 
+#include <net/lib80211.h>
 #include <net/ieee80211.h>
 #include <linux/wireless.h>
 
@@ -258,6 +259,7 @@ int ieee80211_wx_get_scan(struct ieee80211_device *ieee,
 	char *ev = extra;
 	char *stop = ev + wrqu->data.length;
 	int i = 0;
+	DECLARE_SSID_BUF(ssid);
 
 	IEEE80211_DEBUG_WX("Getting scan\n");
 
@@ -277,7 +279,7 @@ int ieee80211_wx_get_scan(struct ieee80211_device *ieee,
 		else
 			IEEE80211_DEBUG_SCAN("Not showing network '%s ("
 					     "%pM)' due to age (%dms).\n",
-					     escape_ssid(network->ssid,
+					     print_ssid(ssid, network->ssid,
 							 network->ssid_len),
 					     network->bssid,
 					     jiffies_to_msecs(jiffies -
@@ -307,6 +309,7 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee,
 	int i, key, key_provided, len;
 	struct ieee80211_crypt_data **crypt;
 	int host_crypto = ieee->host_encrypt || ieee->host_decrypt || ieee->host_build_iv;
+	DECLARE_SSID_BUF(ssid);
 
 	IEEE80211_DEBUG_WX("SET_ENCODE\n");
 
@@ -402,7 +405,7 @@ int ieee80211_wx_set_encode(struct ieee80211_device *ieee,
 			memset(sec.keys[key] + erq->length, 0,
 			       len - erq->length);
 		IEEE80211_DEBUG_WX("Setting key %d to '%s' (%d:%d bytes)\n",
-				   key, escape_ssid(sec.keys[key], len),
+				   key, print_ssid(ssid, sec.keys[key], len),
 				   erq->length, len);
 		sec.key_sizes[key] = len;
 		if (*crypt)
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
index b8e34d31e75..e71f7d08562 100644
--- a/net/wireless/lib80211.c
+++ b/net/wireless/lib80211.c
@@ -19,11 +19,10 @@ MODULE_DESCRIPTION(DRV_DESCRIPTION);
 MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
 MODULE_LICENSE("GPL");
 
-const char *escape_ssid(const char *ssid, u8 ssid_len)
+const char *print_ssid(char *buf, const char *ssid, u8 ssid_len)
 {
-	static char escaped[IEEE80211_MAX_SSID_LEN * 4 + 1];
 	const char *s = ssid;
-	char *d = escaped;
+	char *d = buf;
 
 	ssid_len = min_t(u8, ssid_len, IEEE80211_MAX_SSID_LEN);
 	while (ssid_len--) {
@@ -48,9 +47,9 @@ const char *escape_ssid(const char *ssid, u8 ssid_len)
 		s++;
 	}
 	*d = '\0';
-	return escaped;
+	return buf;
 }
-EXPORT_SYMBOL(escape_ssid);
+EXPORT_SYMBOL(print_ssid);
 
 static int __init ieee80211_init(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 72118015271e6d3852cb9f647efe0987d131adaa Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Tue, 30 Sep 2008 21:43:03 -0400
Subject: wireless: avoid some net/ieee80211.h vs. linux/ieee80211.h conflicts

There is quite a lot of overlap in definitions between these headers...

Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/hostap/hostap_common.h |  13 ----
 drivers/net/wireless/ipw2200.c              |  24 +++---
 include/linux/ieee80211.h                   |   3 +-
 include/net/ieee80211.h                     | 112 +---------------------------
 include/net/lib80211.h                      |   4 +-
 net/ieee80211/ieee80211_rx.c                |   2 +-
 6 files changed, 19 insertions(+), 139 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/hostap/hostap_common.h b/drivers/net/wireless/hostap/hostap_common.h
index b470c743c2d..90b64b09200 100644
--- a/drivers/net/wireless/hostap/hostap_common.h
+++ b/drivers/net/wireless/hostap/hostap_common.h
@@ -6,19 +6,6 @@
 
 /* IEEE 802.11 defines */
 
-/* Information Element IDs */
-#define WLAN_EID_SSID 0
-#define WLAN_EID_SUPP_RATES 1
-#define WLAN_EID_FH_PARAMS 2
-#define WLAN_EID_DS_PARAMS 3
-#define WLAN_EID_CF_PARAMS 4
-#define WLAN_EID_TIM 5
-#define WLAN_EID_IBSS_PARAMS 6
-#define WLAN_EID_CHALLENGE 16
-#define WLAN_EID_RSN 48
-#define WLAN_EID_GENERIC 221
-
-
 /* HFA384X Configuration RIDs */
 #define HFA384X_RID_CNFPORTTYPE 0xFC00
 #define HFA384X_RID_CNFOWNMACADDR 0xFC01
diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c
index 2b9d96a5c10..051ae92d8b6 100644
--- a/drivers/net/wireless/ipw2200.c
+++ b/drivers/net/wireless/ipw2200.c
@@ -4446,7 +4446,7 @@ static void ipw_rx_notification(struct ipw_priv *priv,
 
 #ifdef CONFIG_IPW2200_QOS
 #define IPW_GET_PACKET_STYPE(x) WLAN_FC_GET_STYPE( \
-			 le16_to_cpu(((struct ieee80211_hdr *)(x))->frame_ctl))
+			 le16_to_cpu(((struct ieee80211_hdr *)(x))->frame_control))
 					if ((priv->status & STATUS_AUTH) &&
 					    (IPW_GET_PACKET_STYPE(&notif->u.raw)
 					     == IEEE80211_STYPE_ASSOC_RESP)) {
@@ -7665,12 +7665,12 @@ static void ipw_rebuild_decrypted_skb(struct ipw_priv *priv,
 	u16 fc;
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-	fc = le16_to_cpu(hdr->frame_ctl);
+	fc = le16_to_cpu(hdr->frame_control);
 	if (!(fc & IEEE80211_FCTL_PROTECTED))
 		return;
 
 	fc &= ~IEEE80211_FCTL_PROTECTED;
-	hdr->frame_ctl = cpu_to_le16(fc);
+	hdr->frame_control = cpu_to_le16(fc);
 	switch (priv->ieee->sec.level) {
 	case SEC_LEVEL_3:
 		/* Remove CCMP HDR */
@@ -7982,17 +7982,17 @@ static void ipw_handle_promiscuous_rx(struct ipw_priv *priv,
 	}
 
 	hdr = (void *)rxb->skb->data + IPW_RX_FRAME_SIZE;
-	if (ieee80211_is_management(le16_to_cpu(hdr->frame_ctl))) {
+	if (ieee80211_is_management(le16_to_cpu(hdr->frame_control))) {
 		if (filter & IPW_PROM_NO_MGMT)
 			return;
 		if (filter & IPW_PROM_MGMT_HEADER_ONLY)
 			hdr_only = 1;
-	} else if (ieee80211_is_control(le16_to_cpu(hdr->frame_ctl))) {
+	} else if (ieee80211_is_control(le16_to_cpu(hdr->frame_control))) {
 		if (filter & IPW_PROM_NO_CTL)
 			return;
 		if (filter & IPW_PROM_CTL_HEADER_ONLY)
 			hdr_only = 1;
-	} else if (ieee80211_is_data(le16_to_cpu(hdr->frame_ctl))) {
+	} else if (ieee80211_is_data(le16_to_cpu(hdr->frame_control))) {
 		if (filter & IPW_PROM_NO_DATA)
 			return;
 		if (filter & IPW_PROM_DATA_HEADER_ONLY)
@@ -8010,7 +8010,7 @@ static void ipw_handle_promiscuous_rx(struct ipw_priv *priv,
 	ipw_rt = (void *)skb->data;
 
 	if (hdr_only)
-		len = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
+		len = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_control));
 
 	memcpy(ipw_rt->payload, hdr, len);
 
@@ -8230,7 +8230,7 @@ static  int is_duplicate_packet(struct ipw_priv *priv,
 	/* Comment this line now since we observed the card receives
 	 * duplicate packets but the FCTL_RETRY bit is not set in the
 	 * IBSS mode with fragmentation enabled.
-	 BUG_ON(!(le16_to_cpu(header->frame_ctl) & IEEE80211_FCTL_RETRY)); */
+	 BUG_ON(!(le16_to_cpu(header->frame_control) & IEEE80211_FCTL_RETRY)); */
 	return 1;
 }
 
@@ -10381,17 +10381,17 @@ static void ipw_handle_promiscuous_tx(struct ipw_priv *priv,
 
 	/* Filtering of fragment chains is done agains the first fragment */
 	hdr = (void *)txb->fragments[0]->data;
-	if (ieee80211_is_management(le16_to_cpu(hdr->frame_ctl))) {
+	if (ieee80211_is_management(le16_to_cpu(hdr->frame_control))) {
 		if (filter & IPW_PROM_NO_MGMT)
 			return;
 		if (filter & IPW_PROM_MGMT_HEADER_ONLY)
 			hdr_only = 1;
-	} else if (ieee80211_is_control(le16_to_cpu(hdr->frame_ctl))) {
+	} else if (ieee80211_is_control(le16_to_cpu(hdr->frame_control))) {
 		if (filter & IPW_PROM_NO_CTL)
 			return;
 		if (filter & IPW_PROM_CTL_HEADER_ONLY)
 			hdr_only = 1;
-	} else if (ieee80211_is_data(le16_to_cpu(hdr->frame_ctl))) {
+	} else if (ieee80211_is_data(le16_to_cpu(hdr->frame_control))) {
 		if (filter & IPW_PROM_NO_DATA)
 			return;
 		if (filter & IPW_PROM_DATA_HEADER_ONLY)
@@ -10406,7 +10406,7 @@ static void ipw_handle_promiscuous_tx(struct ipw_priv *priv,
 
 		if (hdr_only) {
 			hdr = (void *)src->data;
-			len = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl));
+			len = ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_control));
 		} else
 			len = src->len;
 
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b0726e2079b..aad99195a4c 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -826,8 +826,7 @@ struct ieee80211_ht_info {
 /* Authentication algorithms */
 #define WLAN_AUTH_OPEN 0
 #define WLAN_AUTH_SHARED_KEY 1
-#define WLAN_AUTH_FAST_BSS_TRANSITION 2
-#define WLAN_AUTH_LEAP 128
+#define WLAN_AUTH_LEAP 2
 
 #define WLAN_AUTH_CHALLENGE_LEN 128
 
diff --git a/include/net/ieee80211.h b/include/net/ieee80211.h
index afa34d3be72..738734a4653 100644
--- a/include/net/ieee80211.h
+++ b/include/net/ieee80211.h
@@ -28,6 +28,7 @@
 #include <linux/if_ether.h>	/* ETH_ALEN */
 #include <linux/kernel.h>	/* ARRAY_SIZE */
 #include <linux/wireless.h>
+#include <linux/ieee80211.h>
 
 #define IEEE80211_VERSION "git-1.1.13"
 
@@ -214,94 +215,6 @@ struct ieee80211_snap_hdr {
 #define WLAN_GET_SEQ_FRAG(seq) ((seq) & IEEE80211_SCTL_FRAG)
 #define WLAN_GET_SEQ_SEQ(seq)  (((seq) & IEEE80211_SCTL_SEQ) >> 4)
 
-/* Authentication algorithms */
-#define WLAN_AUTH_OPEN 0
-#define WLAN_AUTH_SHARED_KEY 1
-#define WLAN_AUTH_LEAP 2
-
-#define WLAN_AUTH_CHALLENGE_LEN 128
-
-#define WLAN_CAPABILITY_ESS (1<<0)
-#define WLAN_CAPABILITY_IBSS (1<<1)
-#define WLAN_CAPABILITY_CF_POLLABLE (1<<2)
-#define WLAN_CAPABILITY_CF_POLL_REQUEST (1<<3)
-#define WLAN_CAPABILITY_PRIVACY (1<<4)
-#define WLAN_CAPABILITY_SHORT_PREAMBLE (1<<5)
-#define WLAN_CAPABILITY_PBCC (1<<6)
-#define WLAN_CAPABILITY_CHANNEL_AGILITY (1<<7)
-#define WLAN_CAPABILITY_SPECTRUM_MGMT (1<<8)
-#define WLAN_CAPABILITY_QOS (1<<9)
-#define WLAN_CAPABILITY_SHORT_SLOT_TIME (1<<10)
-#define WLAN_CAPABILITY_DSSS_OFDM (1<<13)
-
-/* 802.11g ERP information element */
-#define WLAN_ERP_NON_ERP_PRESENT (1<<0)
-#define WLAN_ERP_USE_PROTECTION (1<<1)
-#define WLAN_ERP_BARKER_PREAMBLE (1<<2)
-
-/* Status codes */
-enum ieee80211_statuscode {
-	WLAN_STATUS_SUCCESS = 0,
-	WLAN_STATUS_UNSPECIFIED_FAILURE = 1,
-	WLAN_STATUS_CAPS_UNSUPPORTED = 10,
-	WLAN_STATUS_REASSOC_NO_ASSOC = 11,
-	WLAN_STATUS_ASSOC_DENIED_UNSPEC = 12,
-	WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG = 13,
-	WLAN_STATUS_UNKNOWN_AUTH_TRANSACTION = 14,
-	WLAN_STATUS_CHALLENGE_FAIL = 15,
-	WLAN_STATUS_AUTH_TIMEOUT = 16,
-	WLAN_STATUS_AP_UNABLE_TO_HANDLE_NEW_STA = 17,
-	WLAN_STATUS_ASSOC_DENIED_RATES = 18,
-	/* 802.11b */
-	WLAN_STATUS_ASSOC_DENIED_NOSHORTPREAMBLE = 19,
-	WLAN_STATUS_ASSOC_DENIED_NOPBCC = 20,
-	WLAN_STATUS_ASSOC_DENIED_NOAGILITY = 21,
-	/* 802.11h */
-	WLAN_STATUS_ASSOC_DENIED_NOSPECTRUM = 22,
-	WLAN_STATUS_ASSOC_REJECTED_BAD_POWER = 23,
-	WLAN_STATUS_ASSOC_REJECTED_BAD_SUPP_CHAN = 24,
-	/* 802.11g */
-	WLAN_STATUS_ASSOC_DENIED_NOSHORTTIME = 25,
-	WLAN_STATUS_ASSOC_DENIED_NODSSSOFDM = 26,
-	/* 802.11i */
-	WLAN_STATUS_INVALID_IE = 40,
-	WLAN_STATUS_INVALID_GROUP_CIPHER = 41,
-	WLAN_STATUS_INVALID_PAIRWISE_CIPHER = 42,
-	WLAN_STATUS_INVALID_AKMP = 43,
-	WLAN_STATUS_UNSUPP_RSN_VERSION = 44,
-	WLAN_STATUS_INVALID_RSN_IE_CAP = 45,
-	WLAN_STATUS_CIPHER_SUITE_REJECTED = 46,
-};
-
-/* Reason codes */
-enum ieee80211_reasoncode {
-	WLAN_REASON_UNSPECIFIED = 1,
-	WLAN_REASON_PREV_AUTH_NOT_VALID = 2,
-	WLAN_REASON_DEAUTH_LEAVING = 3,
-	WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY = 4,
-	WLAN_REASON_DISASSOC_AP_BUSY = 5,
-	WLAN_REASON_CLASS2_FRAME_FROM_NONAUTH_STA = 6,
-	WLAN_REASON_CLASS3_FRAME_FROM_NONASSOC_STA = 7,
-	WLAN_REASON_DISASSOC_STA_HAS_LEFT = 8,
-	WLAN_REASON_STA_REQ_ASSOC_WITHOUT_AUTH = 9,
-	/* 802.11h */
-	WLAN_REASON_DISASSOC_BAD_POWER = 10,
-	WLAN_REASON_DISASSOC_BAD_SUPP_CHAN = 11,
-	/* 802.11i */
-	WLAN_REASON_INVALID_IE = 13,
-	WLAN_REASON_MIC_FAILURE = 14,
-	WLAN_REASON_4WAY_HANDSHAKE_TIMEOUT = 15,
-	WLAN_REASON_GROUP_KEY_HANDSHAKE_TIMEOUT = 16,
-	WLAN_REASON_IE_DIFFERENT = 17,
-	WLAN_REASON_INVALID_GROUP_CIPHER = 18,
-	WLAN_REASON_INVALID_PAIRWISE_CIPHER = 19,
-	WLAN_REASON_INVALID_AKMP = 20,
-	WLAN_REASON_UNSUPP_RSN_VERSION = 21,
-	WLAN_REASON_INVALID_RSN_IE_CAP = 22,
-	WLAN_REASON_IEEE8021X_FAILED = 23,
-	WLAN_REASON_CIPHER_SUITE_REJECTED = 24,
-};
-
 /* Action categories - 802.11h */
 enum ieee80211_actioncategories {
 	WLAN_ACTION_SPECTRUM_MGMT = 0,
@@ -530,15 +443,6 @@ enum ieee80211_mfie {
 	MFIE_TYPE_QOS_PARAMETER = 222,
 };
 
-/* Minimal header; can be used for passing 802.11 frames with sufficient
- * information to determine what type of underlying data type is actually
- * stored in the data. */
-struct ieee80211_hdr {
-	__le16 frame_ctl;
-	__le16 duration_id;
-	u8 payload[0];
-} __attribute__ ((packed));
-
 struct ieee80211_hdr_1addr {
 	__le16 frame_ctl;
 	__le16 duration_id;
@@ -586,18 +490,6 @@ struct ieee80211_hdr_3addrqos {
 	__le16 qos_ctl;
 } __attribute__ ((packed));
 
-struct ieee80211_hdr_4addrqos {
-	__le16 frame_ctl;
-	__le16 duration_id;
-	u8 addr1[ETH_ALEN];
-	u8 addr2[ETH_ALEN];
-	u8 addr3[ETH_ALEN];
-	__le16 seq_ctl;
-	u8 addr4[ETH_ALEN];
-	u8 payload[0];
-	__le16 qos_ctl;
-} __attribute__ ((packed));
-
 struct ieee80211_info_element {
 	u8 id;
 	u8 len;
@@ -1187,7 +1079,7 @@ static inline int ieee80211_get_hdrlen(u16 fc)
 
 static inline u8 *ieee80211_get_payload(struct ieee80211_hdr *hdr)
 {
-	switch (ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_ctl))) {
+	switch (ieee80211_get_hdrlen(le16_to_cpu(hdr->frame_control))) {
 	case IEEE80211_1ADDR_LEN:
 		return ((struct ieee80211_hdr_1addr *)hdr)->payload;
 	case IEEE80211_2ADDR_LEN:
diff --git a/include/net/lib80211.h b/include/net/lib80211.h
index 906d96f1b26..e1558a187ac 100644
--- a/include/net/lib80211.h
+++ b/include/net/lib80211.h
@@ -8,9 +8,11 @@
 #ifndef LIB80211_H
 #define LIB80211_H
 
+#include <linux/ieee80211.h>
+
 /* print_ssid() is intended to be used in debug (and possibly error)
  * messages. It should never be used for passing ssid to user space. */
 const char *print_ssid(char *buf, const char *ssid, u8 ssid_len);
-#define DECLARE_SSID_BUF(var) char var[32 * 4 + 1] __maybe_unused
+#define DECLARE_SSID_BUF(var) char var[IEEE80211_MAX_SSID_LEN * 4 + 1] __maybe_unused
 
 #endif /* LIB80211_H */
diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c
index d19c8de6ef2..a91ef847546 100644
--- a/net/ieee80211/ieee80211_rx.c
+++ b/net/ieee80211/ieee80211_rx.c
@@ -40,7 +40,7 @@ static void ieee80211_monitor_rx(struct ieee80211_device *ieee,
 					struct ieee80211_rx_stats *rx_stats)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
-	u16 fc = le16_to_cpu(hdr->frame_ctl);
+	u16 fc = le16_to_cpu(hdr->frame_control);
 
 	skb->dev = ieee->dev;
 	skb_reset_mac_header(skb);
-- 
cgit v1.2.3-70-g09d2


From 8b30b1fe368ab03049435884c11c5c50e4c4ef0b Mon Sep 17 00:00:00 2001
From: Sujith <Sujith.Manoharan@atheros.com>
Date: Fri, 24 Oct 2008 09:55:27 +0530
Subject: mac80211: Re-enable aggregation

Wireless HW without any dedicated queues for aggregation
do not need the ampdu_queues mechanism present right now
in mac80211. Since mac80211 is still incomplete wrt TX MQ
changes, do not allow aggregation sessions for drivers that
set ampdu_queues.

This is only an interim hack until Intel fixes the requeue issue.

Signed-off-by: Sujith <Sujith.Manoharan@atheros.com>
Signed-off-by: Luis Rodriguez <Luis.Rodriguez@Atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath9k/main.c       |  6 ++--
 drivers/net/wireless/iwlwifi/iwl-core.c |  3 +-
 include/linux/skbuff.h                  |  4 +++
 include/net/mac80211.h                  |  8 ++---
 net/core/skbuff.c                       |  1 +
 net/mac80211/ht.c                       | 60 +++++++++++++++++++--------------
 net/mac80211/main.c                     |  7 ++--
 net/mac80211/rx.c                       |  7 ++--
 net/mac80211/tx.c                       | 19 ++++++++---
 net/mac80211/wme.c                      | 24 ++++++-------
 10 files changed, 76 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath9k/main.c b/drivers/net/wireless/ath9k/main.c
index 795fed5cadf..f6dc4c82604 100644
--- a/drivers/net/wireless/ath9k/main.c
+++ b/drivers/net/wireless/ath9k/main.c
@@ -953,10 +953,7 @@ static int ath_attach(u16 devid,
 			&sc->sbands[IEEE80211_BAND_5GHZ];
 	}
 
-	/* FIXME: Have to figure out proper hw init values later */
-
 	hw->queues = 4;
-	hw->ampdu_queues = 1;
 
 	/* Register rate control */
 	hw->rate_control_algorithm = "ath9k_rate_control";
@@ -1745,7 +1742,8 @@ static int ath_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	hw->flags = IEEE80211_HW_RX_INCLUDES_FCS |
 		IEEE80211_HW_HOST_BROADCAST_PS_BUFFERING |
 		IEEE80211_HW_SIGNAL_DBM |
-		IEEE80211_HW_NOISE_DBM;
+		IEEE80211_HW_NOISE_DBM |
+		IEEE80211_HW_AMPDU_AGGREGATION;
 
 	hw->wiphy->interface_modes =
 		BIT(NL80211_IFTYPE_AP) |
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c
index 20c7ff38291..ba05f5ddc6d 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.c
+++ b/drivers/net/wireless/iwlwifi/iwl-core.c
@@ -871,7 +871,8 @@ int iwl_setup_mac(struct iwl_priv *priv)
 
 	/* Tell mac80211 our characteristics */
 	hw->flags = IEEE80211_HW_SIGNAL_DBM |
-		    IEEE80211_HW_NOISE_DBM;
+		    IEEE80211_HW_NOISE_DBM |
+		    IEEE80211_HW_AMPDU_AGGREGATION;
 	hw->wiphy->interface_modes =
 		BIT(NL80211_IFTYPE_AP) |
 		BIT(NL80211_IFTYPE_STATION) |
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 487e34507b4..a01b6f84e3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -250,6 +250,9 @@ typedef unsigned char *sk_buff_data_t;
  *	@tc_verd: traffic control verdict
  *	@ndisc_nodetype: router type (from link layer)
  *	@do_not_encrypt: set to prevent encryption of this frame
+ *	@requeue: set to indicate that the wireless core should attempt
+ *		a software retry on this frame if we failed to
+ *		receive an ACK for it
  *	@dma_cookie: a cookie to one of several possible DMA operations
  *		done by skb DMA functions
  *	@secmark: security marking
@@ -326,6 +329,7 @@ struct sk_buff {
 #endif
 #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
 	__u8			do_not_encrypt:1;
+	__u8			requeue:1;
 #endif
 	/* 0/13/14 bit hole */
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 16c895969e6..bba96a20388 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -242,7 +242,6 @@ struct ieee80211_bss_conf {
  * @IEEE80211_TX_CTL_RATE_CTRL_PROBE: internal to mac80211, can be
  *	set by rate control algorithms to indicate probe rate, will
  *	be cleared for fragmented frames (except on the last fragment)
- * @IEEE80211_TX_CTL_REQUEUE: REMOVE THIS
  */
 enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTL_REQ_TX_STATUS		= BIT(0),
@@ -258,9 +257,6 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_STAT_AMPDU			= BIT(10),
 	IEEE80211_TX_STAT_AMPDU_NO_BACK		= BIT(11),
 	IEEE80211_TX_CTL_RATE_CTRL_PROBE	= BIT(12),
-
-	/* XXX: remove this */
-	IEEE80211_TX_CTL_REQUEUE		= BIT(13),
 };
 
 enum mac80211_rate_control_flags {
@@ -847,6 +843,9 @@ enum ieee80211_tkip_key_type {
  * @IEEE80211_HW_SPECTRUM_MGMT:
  * 	Hardware supports spectrum management defined in 802.11h
  * 	Measurement, Channel Switch, Quieting, TPC
+ *
+ * @IEEE80211_HW_AMPDU_AGGREGATION:
+ *	Hardware supports 11n A-MPDU aggregation.
  */
 enum ieee80211_hw_flags {
 	IEEE80211_HW_RX_INCLUDES_FCS			= 1<<1,
@@ -858,6 +857,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_SIGNAL_DBM				= 1<<7,
 	IEEE80211_HW_NOISE_DBM				= 1<<8,
 	IEEE80211_HW_SPECTRUM_MGMT			= 1<<9,
+	IEEE80211_HW_AMPDU_AGGREGATION			= 1<<10,
 };
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cdfe473181a..c4c8a33f341 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -544,6 +544,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(truesize);
 #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE)
 	C(do_not_encrypt);
+	C(requeue);
 #endif
 	atomic_set(&n->users, 1);
 
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 42c3e590df9..08009d4b7d6 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -458,7 +458,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 	u8 *state;
 	int ret;
 
-	if (tid >= STA_TID_NUM)
+	if ((tid >= STA_TID_NUM) || !(hw->flags & IEEE80211_HW_AMPDU_AGGREGATION))
 		return -EINVAL;
 
 #ifdef CONFIG_MAC80211_HT_DEBUG
@@ -515,17 +515,19 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 			(unsigned long)&sta->timer_to_tid[tid];
 	init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
 
-	/* create a new queue for this aggregation */
-	ret = ieee80211_ht_agg_queue_add(local, sta, tid);
+	if (hw->ampdu_queues) {
+		/* create a new queue for this aggregation */
+		ret = ieee80211_ht_agg_queue_add(local, sta, tid);
 
-	/* case no queue is available to aggregation
-	 * don't switch to aggregation */
-	if (ret) {
+		/* case no queue is available to aggregation
+		 * don't switch to aggregation */
+		if (ret) {
 #ifdef CONFIG_MAC80211_HT_DEBUG
-		printk(KERN_DEBUG "BA request denied - queue unavailable for"
-					" tid %d\n", tid);
+			printk(KERN_DEBUG "BA request denied - "
+			       "queue unavailable for tid %d\n", tid);
 #endif /* CONFIG_MAC80211_HT_DEBUG */
-		goto err_unlock_queue;
+			goto err_unlock_queue;
+		}
 	}
 	sdata = sta->sdata;
 
@@ -544,7 +546,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 		/* No need to requeue the packets in the agg queue, since we
 		 * held the tx lock: no packet could be enqueued to the newly
 		 * allocated queue */
-		ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
+		if (hw->ampdu_queues)
+			ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "BA request denied - HW unavailable for"
 					" tid %d\n", tid);
@@ -554,7 +557,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 	}
 
 	/* Will put all the packets in the new SW queue */
-	ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
+	if (hw->ampdu_queues)
+		ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
 	spin_unlock_bh(&sta->lock);
 
 	/* send an addBA request */
@@ -622,7 +626,8 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw,
 	       ra, tid);
 #endif /* CONFIG_MAC80211_HT_DEBUG */
 
-	ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]);
+	if (hw->ampdu_queues)
+		ieee80211_stop_queue(hw, sta->tid_to_tx_q[tid]);
 
 	*state = HT_AGG_STATE_REQ_STOP_BA_MSK |
 		(initiator << HT_AGG_STATE_INITIATOR_SHIFT);
@@ -635,7 +640,8 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_hw *hw,
 	if (ret) {
 		WARN_ON(ret != -EBUSY);
 		*state = HT_AGG_STATE_OPERATIONAL;
-		ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+		if (hw->ampdu_queues)
+			ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
 		goto stop_BA_exit;
 	}
 
@@ -691,7 +697,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
 #ifdef CONFIG_MAC80211_HT_DEBUG
 		printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid);
 #endif
-		ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
+		if (hw->ampdu_queues)
+			ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
 	}
 	spin_unlock_bh(&sta->lock);
 	rcu_read_unlock();
@@ -745,16 +752,18 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
 		ieee80211_send_delba(sta->sdata, ra, tid,
 			WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
 
-	agg_queue = sta->tid_to_tx_q[tid];
-
-	ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
-
-	/* We just requeued the all the frames that were in the
-	 * removed queue, and since we might miss a softirq we do
-	 * netif_schedule_queue.  ieee80211_wake_queue is not used
-	 * here as this queue is not necessarily stopped
-	 */
-	netif_schedule_queue(netdev_get_tx_queue(local->mdev, agg_queue));
+	if (hw->ampdu_queues) {
+		agg_queue = sta->tid_to_tx_q[tid];
+		ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
+
+		/* We just requeued the all the frames that were in the
+		 * removed queue, and since we might miss a softirq we do
+		 * netif_schedule_queue.  ieee80211_wake_queue is not used
+		 * here as this queue is not necessarily stopped
+		 */
+		netif_schedule_queue(netdev_get_tx_queue(local->mdev,
+							 agg_queue));
+	}
 	spin_lock_bh(&sta->lock);
 	*state = HT_AGG_STATE_IDLE;
 	sta->ampdu_mlme.addba_req_num[tid] = 0;
@@ -1011,7 +1020,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
 		*state |= HT_ADDBA_RECEIVED_MSK;
 		sta->ampdu_mlme.addba_req_num[tid] = 0;
 
-		if (*state == HT_AGG_STATE_OPERATIONAL)
+		if (*state == HT_AGG_STATE_OPERATIONAL &&
+		    local->hw.ampdu_queues)
 			ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]);
 
 		spin_unlock_bh(&sta->lock);
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 88c1975a97a..fa0cc7a1e6b 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -386,8 +386,6 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 					    struct sta_info *sta,
 					    struct sk_buff *skb)
 {
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-
 	sta->tx_filtered_count++;
 
 	/*
@@ -434,10 +432,9 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 		return;
 	}
 
-	if (!test_sta_flags(sta, WLAN_STA_PS) &&
-	    !(info->flags & IEEE80211_TX_CTL_REQUEUE)) {
+	if (!test_sta_flags(sta, WLAN_STA_PS) && !skb->requeue) {
 		/* Software retry the packet once */
-		info->flags |= IEEE80211_TX_CTL_REQUEUE;
+		skb->requeue = 1;
 		ieee80211_remove_tx_extra(local, sta->key, skb);
 		dev_queue_xmit(skb);
 		return;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c4c95f1db60..648a1d0e6c8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -669,7 +669,6 @@ static int ap_sta_ps_end(struct sta_info *sta)
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	int sent = 0;
-	struct ieee80211_tx_info *info;
 
 	atomic_dec(&sdata->bss->num_sta_ps);
 
@@ -685,13 +684,11 @@ static int ap_sta_ps_end(struct sta_info *sta)
 
 	/* Send all buffered frames to the station */
 	while ((skb = skb_dequeue(&sta->tx_filtered)) != NULL) {
-		info = IEEE80211_SKB_CB(skb);
 		sent++;
-		info->flags |= IEEE80211_TX_CTL_REQUEUE;
+		skb->requeue = 1;
 		dev_queue_xmit(skb);
 	}
 	while ((skb = skb_dequeue(&sta->ps_tx_buf)) != NULL) {
-		info = IEEE80211_SKB_CB(skb);
 		local->total_ps_buffered--;
 		sent++;
 #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG
@@ -699,7 +696,7 @@ static int ap_sta_ps_end(struct sta_info *sta)
 		       "since STA not sleeping anymore\n", sdata->dev->name,
 		       sta->sta.addr, sta->sta.aid);
 #endif /* CONFIG_MAC80211_VERBOSE_PS_DEBUG */
-		info->flags |= IEEE80211_TX_CTL_REQUEUE;
+		skb->requeue = 1;
 		dev_queue_xmit(skb);
 	}
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 541e3e64493..d6392af9cd2 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -661,6 +661,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 static ieee80211_tx_result debug_noinline
 ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 {
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
 	size_t hdrlen, per_fragm, num_fragm, payload_len, left;
 	struct sk_buff **frags, *first, *frag;
@@ -677,9 +678,7 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
 	 * This scenario is handled in __ieee80211_tx_prepare but extra
 	 * caution taken here as fragmented ampdu may cause Tx stop.
 	 */
-	if (WARN_ON(tx->flags & IEEE80211_TX_CTL_AMPDU ||
-		    skb_get_queue_mapping(tx->skb) >=
-			ieee80211_num_regular_queues(&tx->local->hw)))
+	if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU))
 		return TX_DROP;
 
 	first = tx->skb;
@@ -951,7 +950,8 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
-	int hdrlen;
+	int hdrlen, tid;
+	u8 *qc, *state;
 
 	memset(tx, 0, sizeof(*tx));
 	tx->skb = skb;
@@ -982,6 +982,15 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
 
 	tx->sta = sta_info_get(local, hdr->addr1);
 
+	if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) {
+		qc = ieee80211_get_qos_ctl(hdr);
+		tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+
+		state = &tx->sta->ampdu_mlme.tid_state_tx[tid];
+		if (*state == HT_AGG_STATE_OPERATIONAL)
+			info->flags |= IEEE80211_TX_CTL_AMPDU;
+	}
+
 	if (is_multicast_ether_addr(hdr->addr1)) {
 		tx->flags &= ~IEEE80211_TX_UNICAST;
 		info->flags |= IEEE80211_TX_CTL_NO_ACK;
@@ -1172,7 +1181,7 @@ retry:
 		 * queues, there's no reason for a driver to reject
 		 * a frame there, warn and drop it.
 		 */
-		if (WARN_ON(queue >= ieee80211_num_regular_queues(&local->hw)))
+		if (WARN_ON(info->flags & IEEE80211_TX_CTL_AMPDU))
 			goto drop;
 
 		store = &local->pending_packet[queue];
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index d27ef7f2d4a..ac71b38f7cb 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -114,8 +114,8 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
 {
 	struct ieee80211_master_priv *mpriv = netdev_priv(dev);
 	struct ieee80211_local *local = mpriv->local;
+	struct ieee80211_hw *hw = &local->hw;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
-	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	struct sta_info *sta;
 	u16 queue;
 	u8 tid;
@@ -124,21 +124,19 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
 	if (unlikely(queue >= local->hw.queues))
 		queue = local->hw.queues - 1;
 
-	if (info->flags & IEEE80211_TX_CTL_REQUEUE) {
+	if (skb->requeue) {
+		if (!hw->ampdu_queues)
+			return queue;
+
 		rcu_read_lock();
 		sta = sta_info_get(local, hdr->addr1);
 		tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
 		if (sta) {
-			struct ieee80211_hw *hw = &local->hw;
 			int ampdu_queue = sta->tid_to_tx_q[tid];
 
 			if ((ampdu_queue < ieee80211_num_queues(hw)) &&
-			    test_bit(ampdu_queue, local->queue_pool)) {
+			    test_bit(ampdu_queue, local->queue_pool))
 				queue = ampdu_queue;
-				info->flags |= IEEE80211_TX_CTL_AMPDU;
-			} else {
-				info->flags &= ~IEEE80211_TX_CTL_AMPDU;
-			}
 		}
 		rcu_read_unlock();
 
@@ -159,20 +157,18 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
 		*p++ = ack_policy | tid;
 		*p = 0;
 
+		if (!hw->ampdu_queues)
+			return queue;
+
 		rcu_read_lock();
 
 		sta = sta_info_get(local, hdr->addr1);
 		if (sta) {
 			int ampdu_queue = sta->tid_to_tx_q[tid];
-			struct ieee80211_hw *hw = &local->hw;
 
 			if ((ampdu_queue < ieee80211_num_queues(hw)) &&
-			    test_bit(ampdu_queue, local->queue_pool)) {
+			    test_bit(ampdu_queue, local->queue_pool))
 				queue = ampdu_queue;
-				info->flags |= IEEE80211_TX_CTL_AMPDU;
-			} else {
-				info->flags &= ~IEEE80211_TX_CTL_AMPDU;
-			}
 		}
 
 		rcu_read_unlock();
-- 
cgit v1.2.3-70-g09d2


From 127cafbb276266b1b8da967bfe25a062ab1d42ab Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 28 Oct 2008 10:51:53 +0800
Subject: tracepoint: introduce *_noupdate APIs.

Impact: add new tracepoint APIs to allow the batched registration of probes

new APIs separate tracepoint_probe_register(),
tracepoint_probe_unregister() into 2 steps. The first step of them
is just update tracepoint_entry, not connect or disconnect.

this patch introduces tracepoint_probe_update_all() for update all.

these APIs are very useful for registering lots of probes
but just updating once. Another very important thing is that
*_noupdate APIs do not require module_mutex.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h |   4 ++
 kernel/tracepoint.c        | 170 +++++++++++++++++++++++++++++++++++----------
 2 files changed, 136 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index c5bb39c7a77..63064e9403f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -112,6 +112,10 @@ extern int tracepoint_probe_register(const char *name, void *probe);
  */
 extern int tracepoint_probe_unregister(const char *name, void *probe);
 
+extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
+extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
+extern void tracepoint_probe_update_all(void);
+
 struct tracepoint_iter {
 	struct module *module;
 	struct tracepoint *tracepoint;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 3e22867184e..e96590f17de 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -59,7 +59,10 @@ struct tracepoint_entry {
 };
 
 struct tp_probes {
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct list_head list;
+	} u;
 	void *probes[0];
 };
 
@@ -72,7 +75,7 @@ static inline void *allocate_probes(int count)
 
 static void rcu_free_old_probes(struct rcu_head *head)
 {
-	kfree(container_of(head, struct tp_probes, rcu));
+	kfree(container_of(head, struct tp_probes, u.rcu));
 }
 
 static inline void release_probes(void *old)
@@ -80,7 +83,7 @@ static inline void release_probes(void *old)
 	if (old) {
 		struct tp_probes *tp_probes = container_of(old,
 			struct tp_probes, probes[0]);
-		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+		call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
 	}
 }
 
@@ -299,6 +302,23 @@ static void tracepoint_update_probes(void)
 	module_update_tracepoints();
 }
 
+static void *tracepoint_add_probe(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	void *old;
+
+	entry = get_tracepoint(name);
+	if (!entry) {
+		entry = add_tracepoint(name);
+		if (IS_ERR(entry))
+			return entry;
+	}
+	old = tracepoint_entry_add_probe(entry, probe);
+	if (IS_ERR(old) && !entry->refcount)
+		remove_tracepoint(entry);
+	return old;
+}
+
 /**
  * tracepoint_probe_register -  Connect a probe to a tracepoint
  * @name: tracepoint name
@@ -309,36 +329,36 @@ static void tracepoint_update_probes(void)
  */
 int tracepoint_probe_register(const char *name, void *probe)
 {
-	struct tracepoint_entry *entry;
-	int ret = 0;
 	void *old;
 
 	mutex_lock(&tracepoints_mutex);
-	entry = get_tracepoint(name);
-	if (!entry) {
-		entry = add_tracepoint(name);
-		if (IS_ERR(entry)) {
-			ret = PTR_ERR(entry);
-			goto end;
-		}
-	}
-	old = tracepoint_entry_add_probe(entry, probe);
-	if (IS_ERR(old)) {
-		if (!entry->refcount)
-			remove_tracepoint(entry);
-		ret = PTR_ERR(old);
-		goto end;
-	}
+	old = tracepoint_add_probe(name, probe);
 	mutex_unlock(&tracepoints_mutex);
+	if (IS_ERR(old))
+		return PTR_ERR(old);
+
 	tracepoint_update_probes();		/* may update entry */
 	release_probes(old);
 	return 0;
-end:
-	mutex_unlock(&tracepoints_mutex);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_register);
 
+static void *tracepoint_remove_probe(const char *name, void *probe)
+{
+	struct tracepoint_entry *entry;
+	void *old;
+
+	entry = get_tracepoint(name);
+	if (!entry)
+		return ERR_PTR(-ENOENT);
+	old = tracepoint_entry_remove_probe(entry, probe);
+	if (IS_ERR(old))
+		return old;
+	if (!entry->refcount)
+		remove_tracepoint(entry);
+	return old;
+}
+
 /**
  * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint
  * @name: tracepoint name
@@ -351,31 +371,105 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
  */
 int tracepoint_probe_unregister(const char *name, void *probe)
 {
-	struct tracepoint_entry *entry;
 	void *old;
-	int ret = -ENOENT;
 
 	mutex_lock(&tracepoints_mutex);
-	entry = get_tracepoint(name);
-	if (!entry)
-		goto end;
-	old = tracepoint_entry_remove_probe(entry, probe);
-	if (IS_ERR(old)) {
-		ret = PTR_ERR(old);
-		goto end;
-	}
-	if (!entry->refcount)
-		remove_tracepoint(entry);
+	old = tracepoint_remove_probe(name, probe);
 	mutex_unlock(&tracepoints_mutex);
+	if (IS_ERR(old))
+		return PTR_ERR(old);
+
 	tracepoint_update_probes();		/* may update entry */
 	release_probes(old);
 	return 0;
-end:
-	mutex_unlock(&tracepoints_mutex);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
 
+static LIST_HEAD(old_probes);
+static int need_update;
+
+static void tracepoint_add_old_probes(void *old)
+{
+	need_update = 1;
+	if (old) {
+		struct tp_probes *tp_probes = container_of(old,
+			struct tp_probes, probes[0]);
+		list_add(&tp_probes->u.list, &old_probes);
+	}
+}
+
+/**
+ * tracepoint_probe_register_noupdate -  register a probe but not connect
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_register_noupdate(const char *name, void *probe)
+{
+	void *old;
+
+	mutex_lock(&tracepoints_mutex);
+	old = tracepoint_add_probe(name, probe);
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
+		return PTR_ERR(old);
+	}
+	tracepoint_add_old_probes(old);
+	mutex_unlock(&tracepoints_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+
+/**
+ * tracepoint_probe_unregister_noupdate -  remove a probe but not disconnect
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+{
+	void *old;
+
+	mutex_lock(&tracepoints_mutex);
+	old = tracepoint_remove_probe(name, probe);
+	if (IS_ERR(old)) {
+		mutex_unlock(&tracepoints_mutex);
+		return PTR_ERR(old);
+	}
+	tracepoint_add_old_probes(old);
+	mutex_unlock(&tracepoints_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
+
+/**
+ * tracepoint_probe_update_all -  update tracepoints
+ */
+void tracepoint_probe_update_all(void)
+{
+	LIST_HEAD(release_probes);
+	struct tp_probes *pos, *next;
+
+	mutex_lock(&tracepoints_mutex);
+	if (!need_update) {
+		mutex_unlock(&tracepoints_mutex);
+		return;
+	}
+	if (!list_empty(&old_probes))
+		list_replace_init(&old_probes, &release_probes);
+	need_update = 0;
+	mutex_unlock(&tracepoints_mutex);
+
+	tracepoint_update_probes();
+	list_for_each_entry_safe(pos, next, &release_probes, u.list) {
+		list_del(&pos->u.list);
+		call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
+	}
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
+
 /**
  * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
  * @tracepoint: current tracepoints (in), next tracepoint (out)
-- 
cgit v1.2.3-70-g09d2


From 7e5e26a3d8ac4bcadb380073dc9604c07a9a6198 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 31 Oct 2008 09:36:38 -0400
Subject: ftrace: fix hardirq header for non ftrace archs

Impact: build fix for non-ftrace architectures

Not all archs implement ftrace, and therefore do not have an asm/ftrace.h.
This patch corrects the problem.

The ftrace_nmi_enter/exit now must be defined for all archs that implement
dynamic ftrace. Currently, only x86 does.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/include/asm/ftrace.h     |  5 -----
 arch/powerpc/include/asm/ftrace.h |  5 -----
 arch/sh/include/asm/ftrace.h      |  5 -----
 arch/sparc/include/asm/ftrace.h   |  5 -----
 arch/x86/include/asm/ftrace.h     | 16 ----------------
 include/linux/ftrace.h            |  5 ++++-
 include/linux/hardirq.h           |  2 +-
 7 files changed, 5 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h
index 3f3a1d1508e..39c8bc1a006 100644
--- a/arch/arm/include/asm/ftrace.h
+++ b/arch/arm/include/asm/ftrace.h
@@ -1,11 +1,6 @@
 #ifndef _ASM_ARM_FTRACE
 #define _ASM_ARM_FTRACE
 
-#ifndef __ASSEMBLY__
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
-#endif
-
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 1cd72700fbc..b298f7a631e 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -1,11 +1,6 @@
 #ifndef _ASM_POWERPC_FTRACE
 #define _ASM_POWERPC_FTRACE
 
-#ifndef __ASSEMBLY__
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
-#endif
-
 #ifdef CONFIG_FUNCTION_TRACER
 #define MCOUNT_ADDR		((long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h
index 31ada0370cb..3aed362c946 100644
--- a/arch/sh/include/asm/ftrace.h
+++ b/arch/sh/include/asm/ftrace.h
@@ -1,11 +1,6 @@
 #ifndef __ASM_SH_FTRACE_H
 #define __ASM_SH_FTRACE_H
 
-#ifndef __ASSEMBLY__
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
-#endif
-
 #ifndef __ASSEMBLY__
 extern void mcount(void);
 #endif
diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h
index 62055ac0496..d27716cd38c 100644
--- a/arch/sparc/include/asm/ftrace.h
+++ b/arch/sparc/include/asm/ftrace.h
@@ -1,11 +1,6 @@
 #ifndef _ASM_SPARC64_FTRACE
 #define _ASM_SPARC64_FTRACE
 
-#ifndef __ASSEMBLY__
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
-#endif
-
 #ifdef CONFIG_MCOUNT
 #define MCOUNT_ADDR		((long)(_mcount))
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index a23468194b8..f8173ed1c97 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -17,23 +17,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	 */
 	return addr - 1;
 }
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-extern void ftrace_nmi_enter(void);
-extern void ftrace_nmi_exit(void);
-#else
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
-#endif
 #endif /* __ASSEMBLY__ */
-
-#else /* CONFIG_FUNCTION_TRACER */
-
-#ifndef __ASSEMBLY__
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
-#endif
-
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #endif /* _ASM_X86_FTRACE_H */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e46a7b34037..0ad1b48aea6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -44,7 +44,6 @@ static inline void ftrace_kill(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
 	FTRACE_FL_FAILED	= (1 << 1),
@@ -105,6 +104,8 @@ extern void ftrace_release(void *start, unsigned long size);
 
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
+extern void ftrace_nmi_enter(void);
+extern void ftrace_nmi_exit(void);
 
 #else
 # define skip_trace(ip)				({ 0; })
@@ -113,6 +114,8 @@ extern void ftrace_enable_daemon(void);
 # define ftrace_disable_daemon()		do { } while (0)
 # define ftrace_enable_daemon()			do { } while (0)
 static inline void ftrace_release(void *start, unsigned long size) { }
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /* totally disable ftrace - can not re-enable after this */
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 0087cb43bec..ffc16ab5a87 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -4,8 +4,8 @@
 #include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <linux/lockdep.h>
+#include <linux/ftrace.h>
 #include <asm/hardirq.h>
-#include <asm/ftrace.h>
 #include <asm/system.h>
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 29cbda77a67cf263d636feea65d3bbc9c7de2e24 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 3 Nov 2008 09:16:39 -0800
Subject: rcu: increase RCU stall-check timeouts

Impact: increase timeout of debug check feature

Increase RCU stall period timeouts to reduce the likelyhood of
false positives.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 5f89b62e698..301dda829e3 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -41,7 +41,7 @@
 #include <linux/seqlock.h>
 
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK	( 3 * HZ) /* for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_CHECK	(10 * HZ) /* for rcp->jiffies_stall */
 #define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ) /* for rcp->jiffies_stall */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
-- 
cgit v1.2.3-70-g09d2


From 6cf3f41e6c08bca6641a695449791c38a25f35ff Mon Sep 17 00:00:00 2001
From: Jay Vosburgh <fubar@us.ibm.com>
Date: Mon, 3 Nov 2008 18:16:50 -0800
Subject: bonding, net: Move last_rx update into bonding recv logic

	The only user of the net_device->last_rx field is bonding.
This patch adds a conditional update of last_rx to the bonding special
logic in skb_bond_should_drop, causing last_rx to only be updated when
the ARP monitor is running.

	This frees network device drivers from the necessity of
updating last_rx, which can have cache line thrash issues.

Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c  |  2 ++
 drivers/net/bonding/bond_sysfs.c |  3 +++
 include/linux/if.h               |  1 +
 include/linux/netdevice.h        | 32 ++++++++++++++++++--------------
 4 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 56c823c175f..39575d76497 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4564,6 +4564,8 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params)
 	bond_dev->tx_queue_len = 0;
 	bond_dev->flags |= IFF_MASTER|IFF_MULTICAST;
 	bond_dev->priv_flags |= IFF_BONDING;
+	if (bond->params.arp_interval)
+		bond_dev->priv_flags |= IFF_MASTER_ARPMON;
 
 	/* At first, we block adding VLANs. That's the only way to
 	 * prevent problems that occur when adding VLANs over an
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 296a865b75d..e400d7dfdfc 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -620,6 +620,8 @@ static ssize_t bonding_store_arp_interval(struct device *d,
 	       ": %s: Setting ARP monitoring interval to %d.\n",
 	       bond->dev->name, new_value);
 	bond->params.arp_interval = new_value;
+	if (bond->params.arp_interval)
+		bond->dev->priv_flags |= IFF_MASTER_ARPMON;
 	if (bond->params.miimon) {
 		printk(KERN_INFO DRV_NAME
 		       ": %s: ARP monitoring cannot be used with MII monitoring. "
@@ -1039,6 +1041,7 @@ static ssize_t bonding_store_miimon(struct device *d,
 			       "ARP monitoring. Disabling ARP monitoring...\n",
 			       bond->dev->name);
 			bond->params.arp_interval = 0;
+			bond->dev->priv_flags &= ~IFF_MASTER_ARPMON;
 			if (bond->params.arp_validate) {
 				bond_unregister_arp(bond);
 				bond->params.arp_validate =
diff --git a/include/linux/if.h b/include/linux/if.h
index 65246846c84..2a6e29620a9 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -65,6 +65,7 @@
 #define IFF_BONDING	0x20		/* bonding master or slave	*/
 #define IFF_SLAVE_NEEDARP 0x40		/* need ARPs for validation	*/
 #define IFF_ISATAP	0x80		/* ISATAP interface (RFC4214)	*/
+#define IFF_MASTER_ARPMON 0x100		/* bonding master, ARP mon in use */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9d77b1d7dca..f1b0dbe5846 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1742,22 +1742,26 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct net_device *master = dev->master;
 
-	if (master &&
-	    (dev->priv_flags & IFF_SLAVE_INACTIVE)) {
-		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
-		    skb->protocol == __constant_htons(ETH_P_ARP))
-			return 0;
-
-		if (master->priv_flags & IFF_MASTER_ALB) {
-			if (skb->pkt_type != PACKET_BROADCAST &&
-			    skb->pkt_type != PACKET_MULTICAST)
+	if (master) {
+		if (master->priv_flags & IFF_MASTER_ARPMON)
+			dev->last_rx = jiffies;
+
+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
+			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+			    skb->protocol == __constant_htons(ETH_P_ARP))
 				return 0;
-		}
-		if (master->priv_flags & IFF_MASTER_8023AD &&
-		    skb->protocol == __constant_htons(ETH_P_SLOW))
-			return 0;
 
-		return 1;
+			if (master->priv_flags & IFF_MASTER_ALB) {
+				if (skb->pkt_type != PACKET_BROADCAST &&
+				    skb->pkt_type != PACKET_MULTICAST)
+					return 0;
+			}
+			if (master->priv_flags & IFF_MASTER_8023AD &&
+			    skb->protocol == __constant_htons(ETH_P_SLOW))
+				return 0;
+
+			return 1;
+		}
 	}
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 511061e2dd1b84bb21bb97c9216a19606c29ac02 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 4 Nov 2008 14:22:55 +0100
Subject: netfilter: netns ebtables: part 1

* propagate netns from userspace, register table in passed netns
* remporarily register every ebt_table in init_net

P. S.: one needs to add ".netns_ok = 1" to igmp_protocol to test with
ebtables(8) in netns.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_bridge/ebtables.h |  2 +-
 net/bridge/netfilter/ebtable_broute.c     |  2 +-
 net/bridge/netfilter/ebtable_filter.c     |  2 +-
 net/bridge/netfilter/ebtable_nat.c        |  2 +-
 net/bridge/netfilter/ebtables.c           | 27 ++++++++++++++-------------
 5 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index d45e29cd1cf..624e7883068 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -300,7 +300,7 @@ struct ebt_table
 
 #define EBT_ALIGN(s) (((s) + (__alignof__(struct ebt_replace)-1)) & \
 		     ~(__alignof__(struct ebt_replace)-1))
-extern int ebt_register_table(struct ebt_table *table);
+extern int ebt_register_table(struct net *net, struct ebt_table *table);
 extern void ebt_unregister_table(struct ebt_table *table);
 extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
    const struct net_device *in, const struct net_device *out,
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 246626bb0c8..1731ce8f747 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -66,7 +66,7 @@ static int __init ebtable_broute_init(void)
 {
 	int ret;
 
-	ret = ebt_register_table(&broute_table);
+	ret = ebt_register_table(&init_net, &broute_table);
 	if (ret < 0)
 		return ret;
 	/* see br_input.c */
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 1a58af51a2e..af8953c9a57 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -95,7 +95,7 @@ static int __init ebtable_filter_init(void)
 {
 	int ret;
 
-	ret = ebt_register_table(&frame_filter);
+	ret = ebt_register_table(&init_net, &frame_filter);
 	if (ret < 0)
 		return ret;
 	ret = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index f60c1e78e57..bafe16029bd 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -102,7 +102,7 @@ static int __init ebtable_nat_init(void)
 {
 	int ret;
 
-	ret = ebt_register_table(&frame_nat);
+	ret = ebt_register_table(&init_net, &frame_nat);
 	if (ret < 0)
 		return ret;
 	ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0fa208e8640..c1a82b2826e 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -55,7 +55,6 @@
 
 
 static DEFINE_MUTEX(ebt_mutex);
-static LIST_HEAD(ebt_tables);
 
 static struct xt_target ebt_standard_target = {
 	.name       = "standard",
@@ -315,9 +314,11 @@ find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
 }
 
 static inline struct ebt_table *
-find_table_lock(const char *name, int *error, struct mutex *mutex)
+find_table_lock(struct net *net, const char *name, int *error,
+		struct mutex *mutex)
 {
-	return find_inlist_lock(&ebt_tables, name, "ebtable_", error, mutex);
+	return find_inlist_lock(&net->xt.tables[NFPROTO_BRIDGE], name,
+				"ebtable_", error, mutex);
 }
 
 static inline int
@@ -944,7 +945,7 @@ static void get_counters(struct ebt_counter *oldcounters,
 }
 
 /* replace the table */
-static int do_replace(void __user *user, unsigned int len)
+static int do_replace(struct net *net, void __user *user, unsigned int len)
 {
 	int ret, i, countersize;
 	struct ebt_table_info *newinfo;
@@ -1016,7 +1017,7 @@ static int do_replace(void __user *user, unsigned int len)
 	if (ret != 0)
 		goto free_counterstmp;
 
-	t = find_table_lock(tmp.name, &ret, &ebt_mutex);
+	t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
 	if (!t) {
 		ret = -ENOENT;
 		goto free_iterate;
@@ -1097,7 +1098,7 @@ free_newinfo:
 	return ret;
 }
 
-int ebt_register_table(struct ebt_table *table)
+int ebt_register_table(struct net *net, struct ebt_table *table)
 {
 	struct ebt_table_info *newinfo;
 	struct ebt_table *t;
@@ -1157,7 +1158,7 @@ int ebt_register_table(struct ebt_table *table)
 	if (ret != 0)
 		goto free_chainstack;
 
-	list_for_each_entry(t, &ebt_tables, list) {
+	list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
 		if (strcmp(t->name, table->name) == 0) {
 			ret = -EEXIST;
 			BUGPRINT("Table name already exists\n");
@@ -1170,7 +1171,7 @@ int ebt_register_table(struct ebt_table *table)
 		ret = -ENOENT;
 		goto free_unlock;
 	}
-	list_add(&table->list, &ebt_tables);
+	list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
 	mutex_unlock(&ebt_mutex);
 	return 0;
 free_unlock:
@@ -1208,7 +1209,7 @@ void ebt_unregister_table(struct ebt_table *table)
 }
 
 /* userspace just supplied us with counters */
-static int update_counters(void __user *user, unsigned int len)
+static int update_counters(struct net *net, void __user *user, unsigned int len)
 {
 	int i, ret;
 	struct ebt_counter *tmp;
@@ -1228,7 +1229,7 @@ static int update_counters(void __user *user, unsigned int len)
 		return -ENOMEM;
 	}
 
-	t = find_table_lock(hlp.name, &ret, &ebt_mutex);
+	t = find_table_lock(net, hlp.name, &ret, &ebt_mutex);
 	if (!t)
 		goto free_tmp;
 
@@ -1386,10 +1387,10 @@ static int do_ebt_set_ctl(struct sock *sk,
 
 	switch(cmd) {
 	case EBT_SO_SET_ENTRIES:
-		ret = do_replace(user, len);
+		ret = do_replace(sock_net(sk), user, len);
 		break;
 	case EBT_SO_SET_COUNTERS:
-		ret = update_counters(user, len);
+		ret = update_counters(sock_net(sk), user, len);
 		break;
 	default:
 		ret = -EINVAL;
@@ -1406,7 +1407,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	if (copy_from_user(&tmp, user, sizeof(tmp)))
 		return -EFAULT;
 
-	t = find_table_lock(tmp.name, &ret, &ebt_mutex);
+	t = find_table_lock(sock_net(sk), tmp.name, &ret, &ebt_mutex);
 	if (!t)
 		return ret;
 
-- 
cgit v1.2.3-70-g09d2


From 6beceee5aa2cb94c4ae9f0784c7d3135d343f5b5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 4 Nov 2008 14:27:15 +0100
Subject: netfilter: netns ebtables: part 2

* return ebt_table from ebt_register_table(), module code will save it into
  per-netns data for unregistration
* duplicate ebt_table at the very beginning of registration -- it's added into
  list, so one ebt_table wouldn't end up in many lists (and each netns has
  different one)
* introduce underscored tables in individial modules, this is temporary to not
  break bisection.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_bridge/ebtables.h |  3 ++-
 net/bridge/netfilter/ebtable_broute.c     | 19 +++++++++----------
 net/bridge/netfilter/ebtable_filter.c     | 17 +++++++++--------
 net/bridge/netfilter/ebtable_nat.c        | 19 ++++++++++---------
 net/bridge/netfilter/ebtables.c           | 23 +++++++++++++++++------
 5 files changed, 47 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 624e7883068..e40ddb94b1a 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -300,7 +300,8 @@ struct ebt_table
 
 #define EBT_ALIGN(s) (((s) + (__alignof__(struct ebt_replace)-1)) & \
 		     ~(__alignof__(struct ebt_replace)-1))
-extern int ebt_register_table(struct net *net, struct ebt_table *table);
+extern struct ebt_table *ebt_register_table(struct net *net,
+					    struct ebt_table *table);
 extern void ebt_unregister_table(struct ebt_table *table);
 extern unsigned int ebt_do_table(unsigned int hook, struct sk_buff *skb,
    const struct net_device *in, const struct net_device *out,
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 1731ce8f747..3277d682dfc 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -41,22 +41,23 @@ static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
 	return 0;
 }
 
-static struct ebt_table broute_table =
+static struct ebt_table __broute_table =
 {
 	.name		= "broute",
 	.table		= &initial_table,
 	.valid_hooks	= 1 << NF_BR_BROUTING,
-	.lock		= __RW_LOCK_UNLOCKED(broute_table.lock),
+	.lock		= __RW_LOCK_UNLOCKED(__broute_table.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
+static struct ebt_table *broute_table;
 
 static int ebt_broute(struct sk_buff *skb)
 {
 	int ret;
 
 	ret = ebt_do_table(NF_BR_BROUTING, skb, skb->dev, NULL,
-	   &broute_table);
+	   broute_table);
 	if (ret == NF_DROP)
 		return 1; /* route it */
 	return 0; /* bridge it */
@@ -64,21 +65,19 @@ static int ebt_broute(struct sk_buff *skb)
 
 static int __init ebtable_broute_init(void)
 {
-	int ret;
-
-	ret = ebt_register_table(&init_net, &broute_table);
-	if (ret < 0)
-		return ret;
+	broute_table = ebt_register_table(&init_net, &__broute_table);
+	if (IS_ERR(broute_table))
+		return PTR_ERR(broute_table);
 	/* see br_input.c */
 	rcu_assign_pointer(br_should_route_hook, ebt_broute);
-	return ret;
+	return 0;
 }
 
 static void __exit ebtable_broute_fini(void)
 {
 	rcu_assign_pointer(br_should_route_hook, NULL);
 	synchronize_net();
-	ebt_unregister_table(&broute_table);
+	ebt_unregister_table(broute_table);
 }
 
 module_init(ebtable_broute_init);
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index af8953c9a57..596564c7aa5 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -50,21 +50,22 @@ static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
 	return 0;
 }
 
-static struct ebt_table frame_filter =
+static struct ebt_table __frame_filter =
 {
 	.name		= "filter",
 	.table		= &initial_table,
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(frame_filter.lock),
+	.lock		= __RW_LOCK_UNLOCKED(__frame_filter.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
+static struct ebt_table *frame_filter;
 
 static unsigned int
 ebt_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
    const struct net_device *out, int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, &frame_filter);
+	return ebt_do_table(hook, skb, in, out, frame_filter);
 }
 
 static struct nf_hook_ops ebt_ops_filter[] __read_mostly = {
@@ -95,19 +96,19 @@ static int __init ebtable_filter_init(void)
 {
 	int ret;
 
-	ret = ebt_register_table(&init_net, &frame_filter);
-	if (ret < 0)
-		return ret;
+	frame_filter = ebt_register_table(&init_net, &__frame_filter);
+	if (IS_ERR(frame_filter))
+		return PTR_ERR(frame_filter);
 	ret = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
 	if (ret < 0)
-		ebt_unregister_table(&frame_filter);
+		ebt_unregister_table(frame_filter);
 	return ret;
 }
 
 static void __exit ebtable_filter_fini(void)
 {
 	nf_unregister_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter));
-	ebt_unregister_table(&frame_filter);
+	ebt_unregister_table(frame_filter);
 }
 
 module_init(ebtable_filter_init);
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index bafe16029bd..0d8fc5bcddd 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -50,28 +50,29 @@ static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
 	return 0;
 }
 
-static struct ebt_table frame_nat =
+static struct ebt_table __frame_nat =
 {
 	.name		= "nat",
 	.table		= &initial_table,
 	.valid_hooks	= NAT_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(frame_nat.lock),
+	.lock		= __RW_LOCK_UNLOCKED(__frame_nat.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
+static struct ebt_table *frame_nat;
 
 static unsigned int
 ebt_nat_dst(unsigned int hook, struct sk_buff *skb, const struct net_device *in
    , const struct net_device *out, int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, &frame_nat);
+	return ebt_do_table(hook, skb, in, out, frame_nat);
 }
 
 static unsigned int
 ebt_nat_src(unsigned int hook, struct sk_buff *skb, const struct net_device *in
    , const struct net_device *out, int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, &frame_nat);
+	return ebt_do_table(hook, skb, in, out, frame_nat);
 }
 
 static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
@@ -102,19 +103,19 @@ static int __init ebtable_nat_init(void)
 {
 	int ret;
 
-	ret = ebt_register_table(&init_net, &frame_nat);
-	if (ret < 0)
-		return ret;
+	frame_nat = ebt_register_table(&init_net, &__frame_nat);
+	if (IS_ERR(frame_nat))
+		return PTR_ERR(frame_nat);
 	ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
 	if (ret < 0)
-		ebt_unregister_table(&frame_nat);
+		ebt_unregister_table(frame_nat);
 	return ret;
 }
 
 static void __exit ebtable_nat_fini(void)
 {
 	nf_unregister_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat));
-	ebt_unregister_table(&frame_nat);
+	ebt_unregister_table(frame_nat);
 }
 
 module_init(ebtable_nat_init);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index c1a82b2826e..82e17527e21 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1098,7 +1098,7 @@ free_newinfo:
 	return ret;
 }
 
-int ebt_register_table(struct net *net, struct ebt_table *table)
+struct ebt_table *ebt_register_table(struct net *net, struct ebt_table *table)
 {
 	struct ebt_table_info *newinfo;
 	struct ebt_table *t;
@@ -1110,14 +1110,21 @@ int ebt_register_table(struct net *net, struct ebt_table *table)
 	    repl->entries_size == 0 ||
 	    repl->counters || table->private) {
 		BUGPRINT("Bad table data for ebt_register_table!!!\n");
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Don't add one table to multiple lists. */
+	table = kmemdup(table, sizeof(struct ebt_table), GFP_KERNEL);
+	if (!table) {
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	countersize = COUNTER_OFFSET(repl->nentries) * nr_cpu_ids;
 	newinfo = vmalloc(sizeof(*newinfo) + countersize);
 	ret = -ENOMEM;
 	if (!newinfo)
-		return -ENOMEM;
+		goto free_table;
 
 	p = vmalloc(repl->entries_size);
 	if (!p)
@@ -1149,7 +1156,7 @@ int ebt_register_table(struct net *net, struct ebt_table *table)
 
 	if (table->check && table->check(newinfo, table->valid_hooks)) {
 		BUGPRINT("The table doesn't like its own initial data, lol\n");
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 	}
 
 	table->private = newinfo;
@@ -1173,7 +1180,7 @@ int ebt_register_table(struct net *net, struct ebt_table *table)
 	}
 	list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
 	mutex_unlock(&ebt_mutex);
-	return 0;
+	return table;
 free_unlock:
 	mutex_unlock(&ebt_mutex);
 free_chainstack:
@@ -1185,7 +1192,10 @@ free_chainstack:
 	vfree(newinfo->entries);
 free_newinfo:
 	vfree(newinfo);
-	return ret;
+free_table:
+	kfree(table);
+out:
+	return ERR_PTR(ret);
 }
 
 void ebt_unregister_table(struct ebt_table *table)
@@ -1206,6 +1216,7 @@ void ebt_unregister_table(struct ebt_table *table)
 		vfree(table->private->chainstack);
 	}
 	vfree(table->private);
+	kfree(table);
 }
 
 /* userspace just supplied us with counters */
-- 
cgit v1.2.3-70-g09d2


From c5995bd2819dc577d0b32b26be0836d16c977e24 Mon Sep 17 00:00:00 2001
From: Stefano Panella <stefano.panella@csr.com>
Date: Tue, 4 Nov 2008 14:06:31 +0000
Subject: uwb: infrastructure for handling Relinquish Request IEs

The structures and event handler needed to handle Relinish Request IEs
received from neighbors.  Nothing is done with these IEs yet.

Signed-off-by: Stefano Panella <stefano.panella@csr.com>
Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/Makefile       |  1 +
 drivers/uwb/ie-rcv.c       | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/uwb/uwb-internal.h |  1 +
 drivers/uwb/uwbd.c         |  4 ++++
 include/linux/uwb/spec.h   | 28 +++++++++++++++++++++++
 5 files changed, 89 insertions(+)
 create mode 100644 drivers/uwb/ie-rcv.c

(limited to 'include/linux')

diff --git a/drivers/uwb/Makefile b/drivers/uwb/Makefile
index 257e6908304..2b99c3e6167 100644
--- a/drivers/uwb/Makefile
+++ b/drivers/uwb/Makefile
@@ -13,6 +13,7 @@ uwb-objs :=		\
 	drp-ie.o	\
 	est.o		\
 	ie.o		\
+	ie-rcv.o	\
 	lc-dev.o	\
 	lc-rc.o		\
 	neh.o		\
diff --git a/drivers/uwb/ie-rcv.c b/drivers/uwb/ie-rcv.c
new file mode 100644
index 00000000000..917e6d78a79
--- /dev/null
+++ b/drivers/uwb/ie-rcv.c
@@ -0,0 +1,55 @@
+/*
+ * Ultra Wide Band
+ * IE Received notification handling.
+ *
+ * Copyright (C) 2008 Cambridge Silicon Radio Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/bitmap.h>
+#include "uwb-internal.h"
+
+/*
+ * Process an incoming IE Received notification.
+ */
+int uwbd_evt_handle_rc_ie_rcv(struct uwb_event *evt)
+{
+	int result = -EINVAL;
+	struct device *dev = &evt->rc->uwb_dev.dev;
+	struct uwb_rc_evt_ie_rcv *iercv;
+	size_t iesize;
+
+	/* Is there enough data to decode it? */
+	if (evt->notif.size < sizeof(*iercv)) {
+		dev_err(dev, "IE Received notification: Not enough data to "
+			"decode (%zu vs %zu bytes needed)\n",
+			evt->notif.size, sizeof(*iercv));
+		goto error;
+	}
+	iercv = container_of(evt->notif.rceb, struct uwb_rc_evt_ie_rcv, rceb);
+	iesize = le16_to_cpu(iercv->wIELength);
+
+	dev_dbg(dev, "IE received, element ID=%d\n", iercv->IEData[0]);
+
+	if (iercv->IEData[0] == UWB_RELINQUISH_REQUEST_IE) {
+		dev_warn(dev, "unhandled Relinquish Request IE\n");
+	}
+
+	return 0;
+error:
+	return result;
+}
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 983ebc4dd8d..031e8a88568 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -167,6 +167,7 @@ extern void uwbd_event_queue(struct uwb_event *);
 void uwbd_flush(struct uwb_rc *rc);
 
 /* UWB event handlers */
+extern int uwbd_evt_handle_rc_ie_rcv(struct uwb_event *);
 extern int uwbd_evt_handle_rc_beacon(struct uwb_event *);
 extern int uwbd_evt_handle_rc_beacon_size(struct uwb_event *);
 extern int uwbd_evt_handle_rc_bpoie_change(struct uwb_event *);
diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c
index 78908416e42..f75113571f4 100644
--- a/drivers/uwb/uwbd.c
+++ b/drivers/uwb/uwbd.c
@@ -104,6 +104,10 @@ struct uwbd_event {
 /** Table of handlers for and properties of the UWBD Radio Control Events */
 static
 struct uwbd_event uwbd_events[] = {
+	[UWB_RC_EVT_IE_RCV] = {
+		.handler = uwbd_evt_handle_rc_ie_rcv,
+		.name = "IE_RECEIVED"
+	},
 	[UWB_RC_EVT_BEACON] = {
 		.handler = uwbd_evt_handle_rc_beacon,
 		.name = "BEACON_RECEIVED"
diff --git a/include/linux/uwb/spec.h b/include/linux/uwb/spec.h
index 198c15f8e25..a30436ea53a 100644
--- a/include/linux/uwb/spec.h
+++ b/include/linux/uwb/spec.h
@@ -200,6 +200,12 @@ enum uwb_drp_reason {
 	UWB_DRP_REASON_MODIFIED,
 };
 
+/** Relinquish Request Reason Codes ([ECMA-368] table 113) */
+enum uwb_relinquish_req_reason {
+	UWB_RELINQUISH_REQ_REASON_NON_SPECIFIC = 0,
+	UWB_RELINQUISH_REQ_REASON_OVER_ALLOCATION,
+};
+
 /**
  *  DRP Notification Reason Codes (WHCI 0.95 [3.1.4.9])
  */
@@ -252,6 +258,7 @@ enum uwb_ie {
 	UWB_APP_SPEC_PROBE_IE = 15,
 	UWB_IDENTIFICATION_IE = 19,
 	UWB_MASTER_KEY_ID_IE = 20,
+	UWB_RELINQUISH_REQUEST_IE = 21,
 	UWB_IE_WLP = 250, /* WiMedia Logical Link Control Protocol WLP 0.99 */
 	UWB_APP_SPEC_IE = 255,
 };
@@ -365,6 +372,27 @@ struct uwb_ie_drp_avail {
 	DECLARE_BITMAP(bmp, UWB_NUM_MAS);
 } __attribute__((packed));
 
+/* Relinqish Request IE ([ECMA-368] section 16.8.19). */
+struct uwb_relinquish_request_ie {
+        struct uwb_ie_hdr       hdr;
+        __le16                  relinquish_req_control;
+        struct uwb_dev_addr     dev_addr;
+        struct uwb_drp_alloc    allocs[];
+} __attribute__((packed));
+
+static inline int uwb_ie_relinquish_req_reason_code(struct uwb_relinquish_request_ie *ie)
+{
+	return (le16_to_cpu(ie->relinquish_req_control) >> 0) & 0xf;
+}
+
+static inline void uwb_ie_relinquish_req_set_reason_code(struct uwb_relinquish_request_ie *ie,
+							 int reason_code)
+{
+	u16 ctrl = le16_to_cpu(ie->relinquish_req_control);
+	ctrl = (ctrl & ~(0xf << 0)) | (reason_code << 0);
+	ie->relinquish_req_control = cpu_to_le16(ctrl);
+}
+
 /**
  * The Vendor ID is set to an OUI that indicates the vendor of the device.
  * ECMA-368 [16.8.10]
-- 
cgit v1.2.3-70-g09d2


From 6d5a681dfb583b2f1eefe7cd5505419ca2d4d6c8 Mon Sep 17 00:00:00 2001
From: Stefano Panella <stefano.panella@csr.com>
Date: Tue, 4 Nov 2008 14:24:57 +0000
Subject: uwb: add commands to add/remove IEs to the debug interface

Add the commands UWB_DBG_CMD_IE_ADD and UWB_DBG_CMD_IE_RM to the debug
interface and make them call uwb_rc_ie_add() and uwb_rc_ie_rm().

Signed-off-by: Stefano Panella <stefano.panella@csr.com>
Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/uwb-debug.c       | 20 +++++++++++++++++++-
 include/linux/uwb/debug-cmd.h |  9 +++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 6db641e4531..88e6ac71381 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -168,6 +168,18 @@ static int cmd_rsv_terminate(struct uwb_rc *rc,
 	return 0;
 }
 
+static int cmd_ie_add(struct uwb_rc *rc, struct uwb_dbg_cmd_ie *ie_to_add)
+{
+	return uwb_rc_ie_add(rc,
+			     (const struct uwb_ie_hdr *) ie_to_add->data,
+			     ie_to_add->len);
+}
+
+static int cmd_ie_rm(struct uwb_rc *rc, struct uwb_dbg_cmd_ie *ie_to_rm)
+{
+	return uwb_rc_ie_rm(rc, ie_to_rm->data[0]);
+}
+
 static int command_open(struct inode *inode, struct file *file)
 {
 	file->private_data = inode->i_private;
@@ -195,6 +207,12 @@ static ssize_t command_write(struct file *file, const char __user *buf,
 	case UWB_DBG_CMD_RSV_TERMINATE:
 		ret = cmd_rsv_terminate(rc, &cmd.rsv_terminate);
 		break;
+	case UWB_DBG_CMD_IE_ADD:
+		ret = cmd_ie_add(rc, &cmd.ie_add);
+		break;
+	case UWB_DBG_CMD_IE_RM:
+		ret = cmd_ie_rm(rc, &cmd.ie_rm);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -332,7 +350,7 @@ void uwb_dbg_add_rc(struct uwb_rc *rc)
 }
 
 /**
- * uwb_dbg_add_rc - remove a radio controller's debug interface
+ * uwb_dbg_del_rc - remove a radio controller's debug interface
  * @rc: the radio controller
  */
 void uwb_dbg_del_rc(struct uwb_rc *rc)
diff --git a/include/linux/uwb/debug-cmd.h b/include/linux/uwb/debug-cmd.h
index 1141f41bab5..6a16566f022 100644
--- a/include/linux/uwb/debug-cmd.h
+++ b/include/linux/uwb/debug-cmd.h
@@ -32,6 +32,8 @@
 enum uwb_dbg_cmd_type {
 	UWB_DBG_CMD_RSV_ESTABLISH = 1,
 	UWB_DBG_CMD_RSV_TERMINATE = 2,
+	UWB_DBG_CMD_IE_ADD = 3,
+	UWB_DBG_CMD_IE_RM = 4,
 };
 
 struct uwb_dbg_cmd_rsv_establish {
@@ -46,11 +48,18 @@ struct uwb_dbg_cmd_rsv_terminate {
 	int index;
 };
 
+struct uwb_dbg_cmd_ie {
+	__u8 data[128];
+	int len;
+};
+
 struct uwb_dbg_cmd {
 	__u32 type;
 	union {
 		struct uwb_dbg_cmd_rsv_establish rsv_establish;
 		struct uwb_dbg_cmd_rsv_terminate rsv_terminate;
+		struct uwb_dbg_cmd_ie ie_add;
+		struct uwb_dbg_cmd_ie ie_rm;
 	};
 };
 
-- 
cgit v1.2.3-70-g09d2


From fec1a5932f16c0eb1b3f5ca2e18d81d860924088 Mon Sep 17 00:00:00 2001
From: Stefano Panella <stefano.panella@csr.com>
Date: Tue, 4 Nov 2008 15:39:08 +0000
Subject: uwb: per-radio controller event thread and beacon cache

Use an event thread per-radio controller so processing events from one
radio controller doesn't delay another.

A radio controller shouldn't have information on devices seen by a
different radio controller (they may be on different channels) so make the
beacon cache per-radio controller.

Signed-off-by: Stefano Panella <stefano.panella@csr.com>
Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/beacon.c       | 61 ++++++++++++++---------------
 drivers/uwb/driver.c       |  2 -
 drivers/uwb/lc-rc.c        | 21 ++++++----
 drivers/uwb/uwb-internal.h | 20 ++--------
 drivers/uwb/uwbd.c         | 98 +++++++++++++++-------------------------------
 include/linux/uwb.h        | 20 ++++++++++
 6 files changed, 100 insertions(+), 122 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/beacon.c b/drivers/uwb/beacon.c
index ad823987ced..d9f2a8acc59 100644
--- a/drivers/uwb/beacon.c
+++ b/drivers/uwb/beacon.c
@@ -168,12 +168,6 @@ out_up:
  * FIXME: use something faster for search than a list
  */
 
-struct uwb_beca uwb_beca = {
-	.list = LIST_HEAD_INIT(uwb_beca.list),
-	.mutex = __MUTEX_INITIALIZER(uwb_beca.mutex)
-};
-
-
 void uwb_bce_kfree(struct kref *_bce)
 {
 	struct uwb_beca_e *bce = container_of(_bce, struct uwb_beca_e, refcnt);
@@ -185,10 +179,11 @@ void uwb_bce_kfree(struct kref *_bce)
 
 /* Find a beacon by dev addr in the cache */
 static
-struct uwb_beca_e *__uwb_beca_find_bydev(const struct uwb_dev_addr *dev_addr)
+struct uwb_beca_e *__uwb_beca_find_bydev(struct uwb_rc *rc,
+					 const struct uwb_dev_addr *dev_addr)
 {
 	struct uwb_beca_e *bce, *next;
-	list_for_each_entry_safe(bce, next, &uwb_beca.list, node) {
+	list_for_each_entry_safe(bce, next, &rc->uwb_beca.list, node) {
 		d_printf(6, NULL, "looking for addr %02x:%02x in %02x:%02x\n",
 			 dev_addr->data[0], dev_addr->data[1],
 			 bce->dev_addr.data[0], bce->dev_addr.data[1]);
@@ -202,10 +197,11 @@ out:
 
 /* Find a beacon by dev addr in the cache */
 static
-struct uwb_beca_e *__uwb_beca_find_bymac(const struct uwb_mac_addr *mac_addr)
+struct uwb_beca_e *__uwb_beca_find_bymac(struct uwb_rc *rc, 
+					 const struct uwb_mac_addr *mac_addr)
 {
 	struct uwb_beca_e *bce, *next;
-	list_for_each_entry_safe(bce, next, &uwb_beca.list, node) {
+	list_for_each_entry_safe(bce, next, &rc->uwb_beca.list, node) {
 		if (!memcmp(bce->mac_addr, mac_addr->data,
 			    sizeof(struct uwb_mac_addr)))
 			goto out;
@@ -229,11 +225,11 @@ struct uwb_dev *uwb_dev_get_by_devaddr(struct uwb_rc *rc,
 	struct uwb_dev *found = NULL;
 	struct uwb_beca_e *bce;
 
-	mutex_lock(&uwb_beca.mutex);
-	bce = __uwb_beca_find_bydev(devaddr);
+	mutex_lock(&rc->uwb_beca.mutex);
+	bce = __uwb_beca_find_bydev(rc, devaddr);
 	if (bce)
 		found = uwb_dev_try_get(rc, bce->uwb_dev);
-	mutex_unlock(&uwb_beca.mutex);
+	mutex_unlock(&rc->uwb_beca.mutex);
 
 	return found;
 }
@@ -249,11 +245,11 @@ struct uwb_dev *uwb_dev_get_by_macaddr(struct uwb_rc *rc,
 	struct uwb_dev *found = NULL;
 	struct uwb_beca_e *bce;
 
-	mutex_lock(&uwb_beca.mutex);
-	bce = __uwb_beca_find_bymac(macaddr);
+	mutex_lock(&rc->uwb_beca.mutex);
+	bce = __uwb_beca_find_bymac(rc, macaddr);
 	if (bce)
 		found = uwb_dev_try_get(rc, bce->uwb_dev);
-	mutex_unlock(&uwb_beca.mutex);
+	mutex_unlock(&rc->uwb_beca.mutex);
 
 	return found;
 }
@@ -274,7 +270,9 @@ static void uwb_beca_e_init(struct uwb_beca_e *bce)
  * @bf:         Beacon frame (part of b, really)
  * @ts_jiffies: Timestamp (in jiffies) when the beacon was received
  */
-struct uwb_beca_e *__uwb_beca_add(struct uwb_rc_evt_beacon *be,
+static
+struct uwb_beca_e *__uwb_beca_add(struct uwb_rc *rc,
+				  struct uwb_rc_evt_beacon *be,
 				  struct uwb_beacon_frame *bf,
 				  unsigned long ts_jiffies)
 {
@@ -286,7 +284,7 @@ struct uwb_beca_e *__uwb_beca_add(struct uwb_rc_evt_beacon *be,
 	uwb_beca_e_init(bce);
 	bce->ts_jiffies = ts_jiffies;
 	bce->uwb_dev = NULL;
-	list_add(&bce->node, &uwb_beca.list);
+	list_add(&bce->node, &rc->uwb_beca.list);
 	return bce;
 }
 
@@ -295,13 +293,13 @@ struct uwb_beca_e *__uwb_beca_add(struct uwb_rc_evt_beacon *be,
  *
  * Remove associated devicest too.
  */
-void uwb_beca_purge(void)
+void uwb_beca_purge(struct uwb_rc *rc)
 {
 	struct uwb_beca_e *bce, *next;
 	unsigned long expires;
 
-	mutex_lock(&uwb_beca.mutex);
-	list_for_each_entry_safe(bce, next, &uwb_beca.list, node) {
+	mutex_lock(&rc->uwb_beca.mutex);
+	list_for_each_entry_safe(bce, next, &rc->uwb_beca.list, node) {
 		expires = bce->ts_jiffies + msecs_to_jiffies(beacon_timeout_ms);
 		if (time_after(jiffies, expires)) {
 			uwbd_dev_offair(bce);
@@ -309,19 +307,20 @@ void uwb_beca_purge(void)
 			uwb_bce_put(bce);
 		}
 	}
-	mutex_unlock(&uwb_beca.mutex);
+	mutex_unlock(&rc->uwb_beca.mutex);
 }
 
 /* Clean up the whole beacon cache. Called on shutdown */
-void uwb_beca_release(void)
+void uwb_beca_release(struct uwb_rc *rc)
 {
 	struct uwb_beca_e *bce, *next;
-	mutex_lock(&uwb_beca.mutex);
-	list_for_each_entry_safe(bce, next, &uwb_beca.list, node) {
+
+	mutex_lock(&rc->uwb_beca.mutex);
+	list_for_each_entry_safe(bce, next, &rc->uwb_beca.list, node) {
 		list_del(&bce->node);
 		uwb_bce_put(bce);
 	}
-	mutex_unlock(&uwb_beca.mutex);
+	mutex_unlock(&rc->uwb_beca.mutex);
 }
 
 static void uwb_beacon_print(struct uwb_rc *rc, struct uwb_rc_evt_beacon *be,
@@ -437,18 +436,18 @@ int uwbd_evt_handle_rc_beacon(struct uwb_event *evt)
 	if (uwb_mac_addr_bcast(&bf->Device_Identifier))
 		return 0;
 
-	mutex_lock(&uwb_beca.mutex);
-	bce = __uwb_beca_find_bymac(&bf->Device_Identifier);
+	mutex_lock(&rc->uwb_beca.mutex);
+	bce = __uwb_beca_find_bymac(rc, &bf->Device_Identifier);
 	if (bce == NULL) {
 		/* Not in there, a new device is pinging */
 		uwb_beacon_print(evt->rc, be, bf);
-		bce = __uwb_beca_add(be, bf, evt->ts_jiffies);
+		bce = __uwb_beca_add(rc, be, bf, evt->ts_jiffies);
 		if (bce == NULL) {
-			mutex_unlock(&uwb_beca.mutex);
+			mutex_unlock(&rc->uwb_beca.mutex);
 			return -ENOMEM;
 		}
 	}
-	mutex_unlock(&uwb_beca.mutex);
+	mutex_unlock(&rc->uwb_beca.mutex);
 
 	mutex_lock(&bce->mutex);
 	/* purge old beacon data */
diff --git a/drivers/uwb/driver.c b/drivers/uwb/driver.c
index 521cdeb8497..f57c26580de 100644
--- a/drivers/uwb/driver.c
+++ b/drivers/uwb/driver.c
@@ -118,7 +118,6 @@ static int __init uwb_subsys_init(void)
 	result = class_register(&uwb_rc_class);
 	if (result < 0)
 		goto error_uwb_rc_class_register;
-	uwbd_start();
 	uwb_dbg_init();
 	return 0;
 
@@ -132,7 +131,6 @@ module_init(uwb_subsys_init);
 static void __exit uwb_subsys_exit(void)
 {
 	uwb_dbg_exit();
-	uwbd_stop();
 	class_unregister(&uwb_rc_class);
 	uwb_est_destroy();
 	return;
diff --git a/drivers/uwb/lc-rc.c b/drivers/uwb/lc-rc.c
index 1129e8767b5..38e3d57ec8f 100644
--- a/drivers/uwb/lc-rc.c
+++ b/drivers/uwb/lc-rc.c
@@ -36,8 +36,6 @@
 #include <linux/etherdevice.h>
 #include <linux/usb.h>
 
-#define D_LOCAL 1
-#include <linux/uwb/debug.h>
 #include "uwb-internal.h"
 
 static int uwb_rc_index_match(struct device *dev, void *data)
@@ -83,7 +81,6 @@ static void uwb_rc_sys_release(struct device *dev)
 
 	uwb_rc_neh_destroy(rc);
 	uwb_rc_ie_release(rc);
-	d_printf(1, dev, "freed uwb_rc %p\n", rc);
 	kfree(rc);
 }
 
@@ -100,6 +97,8 @@ void uwb_rc_init(struct uwb_rc *rc)
 	rc->scan_type = UWB_SCAN_DISABLED;
 	INIT_LIST_HEAD(&rc->notifs_chain.list);
 	mutex_init(&rc->notifs_chain.mutex);
+	INIT_LIST_HEAD(&rc->uwb_beca.list);
+	mutex_init(&rc->uwb_beca.mutex);
 	uwb_drp_avail_init(rc);
 	uwb_rc_ie_init(rc);
 	uwb_rsv_init(rc);
@@ -250,6 +249,12 @@ int uwb_rc_add(struct uwb_rc *rc, struct device *parent_dev, void *priv)
 
 	rc->priv = priv;
 
+	init_waitqueue_head(&rc->uwbd.wq);
+	INIT_LIST_HEAD(&rc->uwbd.event_list);
+	spin_lock_init(&rc->uwbd.event_list_lock);
+
+	uwbd_start(rc);
+
 	result = rc->start(rc);
 	if (result < 0)
 		goto error_rc_start;
@@ -284,7 +289,7 @@ error_sys_add:
 error_dev_add:
 error_rc_setup:
 	rc->stop(rc);
-	uwbd_flush(rc);
+	uwbd_stop(rc);
 error_rc_start:
 	return result;
 }
@@ -315,16 +320,18 @@ void uwb_rc_rm(struct uwb_rc *rc)
 	uwb_rc_reset(rc);
 
 	rc->stop(rc);
-	uwbd_flush(rc);
+
+	uwbd_stop(rc);
 
 	uwb_dev_lock(&rc->uwb_dev);
 	rc->priv = NULL;
 	rc->cmd = NULL;
 	uwb_dev_unlock(&rc->uwb_dev);
-	mutex_lock(&uwb_beca.mutex);
+	mutex_lock(&rc->uwb_beca.mutex);
 	uwb_dev_for_each(rc, uwb_dev_offair_helper, NULL);
 	__uwb_rc_sys_rm(rc);
-	mutex_unlock(&uwb_beca.mutex);
+	mutex_unlock(&rc->uwb_beca.mutex);
+ 	uwb_beca_release(rc);
 	uwb_dev_rm(&rc->uwb_dev);
 }
 EXPORT_SYMBOL_GPL(uwb_rc_rm);
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 031e8a88568..4c244967997 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -160,8 +160,8 @@ struct uwb_event {
 	};
 };
 
-extern void uwbd_start(void);
-extern void uwbd_stop(void);
+extern void uwbd_start(struct uwb_rc *rc);
+extern void uwbd_stop(struct uwb_rc *rc);
 extern struct uwb_event *uwb_event_alloc(size_t, gfp_t gfp_mask);
 extern void uwbd_event_queue(struct uwb_event *);
 void uwbd_flush(struct uwb_rc *rc);
@@ -194,15 +194,6 @@ int uwbd_evt_handle_rc_dev_addr_conflict(struct uwb_event *evt);
 
 extern unsigned long beacon_timeout_ms;
 
-/** Beacon cache list */
-struct uwb_beca {
-	struct list_head list;
-	size_t entries;
-	struct mutex mutex;
-};
-
-extern struct uwb_beca uwb_beca;
-
 /**
  * Beacon cache entry
  *
@@ -229,9 +220,6 @@ struct uwb_beca_e {
 struct uwb_beacon_frame;
 extern ssize_t uwb_bce_print_IEs(struct uwb_dev *, struct uwb_beca_e *,
 				 char *, size_t);
-extern struct uwb_beca_e *__uwb_beca_add(struct uwb_rc_evt_beacon *,
-					 struct uwb_beacon_frame *,
-					 unsigned long);
 
 extern void uwb_bce_kfree(struct kref *_bce);
 static inline void uwb_bce_get(struct uwb_beca_e *bce)
@@ -242,8 +230,8 @@ static inline void uwb_bce_put(struct uwb_beca_e *bce)
 {
 	kref_put(&bce->refcnt, uwb_bce_kfree);
 }
-extern void uwb_beca_purge(void);
-extern void uwb_beca_release(void);
+extern void uwb_beca_purge(struct uwb_rc *rc);
+extern void uwb_beca_release(struct uwb_rc *rc);
 
 struct uwb_dev *uwb_dev_get_by_devaddr(struct uwb_rc *rc,
 				       const struct uwb_dev_addr *devaddr);
diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c
index f75113571f4..ec42ce92dbc 100644
--- a/drivers/uwb/uwbd.c
+++ b/drivers/uwb/uwbd.c
@@ -170,8 +170,6 @@ static const struct uwbd_event uwbd_message_handlers[] = {
 	},
 };
 
-static DEFINE_MUTEX(uwbd_event_mutex);
-
 /**
  * Handle an URC event passed to the UWB Daemon
  *
@@ -235,19 +233,10 @@ static void uwbd_event_handle_message(struct uwb_event *evt)
 		return;
 	}
 
-	/* If this is a reset event we need to drop the
-	 * uwbd_event_mutex or it deadlocks when the reset handler
-	 * attempts to flush the uwbd events. */
-	if (evt->message == UWB_EVT_MSG_RESET)
-		mutex_unlock(&uwbd_event_mutex);
-
 	result = uwbd_message_handlers[evt->message].handler(evt);
 	if (result < 0)
 		dev_err(&rc->uwb_dev.dev, "UWBD: '%s' message failed: %d\n",
 			uwbd_message_handlers[evt->message].name, result);
-
-	if (evt->message == UWB_EVT_MSG_RESET)
-		mutex_lock(&uwbd_event_mutex);
 }
 
 static void uwbd_event_handle(struct uwb_event *evt)
@@ -275,20 +264,6 @@ static void uwbd_event_handle(struct uwb_event *evt)
 
 	__uwb_rc_put(rc);	/* for the __uwb_rc_get() in uwb_rc_notif_cb() */
 }
-/* The UWB Daemon */
-
-
-/** Daemon's PID: used to decide if we can queue or not */
-static int uwbd_pid;
-/** Daemon's task struct for managing the kthread */
-static struct task_struct *uwbd_task;
-/** Daemon's waitqueue for waiting for new events */
-static DECLARE_WAIT_QUEUE_HEAD(uwbd_wq);
-/** Daemon's list of events; we queue/dequeue here */
-static struct list_head uwbd_event_list = LIST_HEAD_INIT(uwbd_event_list);
-/** Daemon's list lock to protect concurent access */
-static DEFINE_SPINLOCK(uwbd_event_list_lock);
-
 
 /**
  * UWB Daemon
@@ -302,65 +277,58 @@ static DEFINE_SPINLOCK(uwbd_event_list_lock);
  * FIXME: should change so we don't have a 1HZ timer all the time, but
  *        only if there are devices.
  */
-static int uwbd(void *unused)
+static int uwbd(void *param)
 {
+	struct uwb_rc *rc = param;
 	unsigned long flags;
-	struct list_head list = LIST_HEAD_INIT(list);
-	struct uwb_event *evt, *nxt;
+	struct uwb_event *evt;
 	int should_stop = 0;
+
 	while (1) {
 		wait_event_interruptible_timeout(
-			uwbd_wq,
-			!list_empty(&uwbd_event_list)
+			rc->uwbd.wq,
+			!list_empty(&rc->uwbd.event_list)
 			  || (should_stop = kthread_should_stop()),
 			HZ);
 		if (should_stop)
 			break;
 		try_to_freeze();
 
-		mutex_lock(&uwbd_event_mutex);
-		spin_lock_irqsave(&uwbd_event_list_lock, flags);
-		list_splice_init(&uwbd_event_list, &list);
-		spin_unlock_irqrestore(&uwbd_event_list_lock, flags);
-		list_for_each_entry_safe(evt, nxt, &list, list_node) {
+		spin_lock_irqsave(&rc->uwbd.event_list_lock, flags);
+		if (!list_empty(&rc->uwbd.event_list)) {
+			evt = list_first_entry(&rc->uwbd.event_list, struct uwb_event, list_node);
 			list_del(&evt->list_node);
+		} else
+			evt = NULL;
+		spin_unlock_irqrestore(&rc->uwbd.event_list_lock, flags);
+
+		if (evt) {
 			uwbd_event_handle(evt);
 			kfree(evt);
 		}
-		mutex_unlock(&uwbd_event_mutex);
 
-		uwb_beca_purge();	/* Purge devices that left */
+		uwb_beca_purge(rc);	/* Purge devices that left */
 	}
 	return 0;
 }
 
 
 /** Start the UWB daemon */
-void uwbd_start(void)
+void uwbd_start(struct uwb_rc *rc)
 {
-	uwbd_task = kthread_run(uwbd, NULL, "uwbd");
-	if (uwbd_task == NULL)
+	rc->uwbd.task = kthread_run(uwbd, rc, "uwbd");
+	if (rc->uwbd.task == NULL)
 		printk(KERN_ERR "UWB: Cannot start management daemon; "
 		       "UWB won't work\n");
 	else
-		uwbd_pid = uwbd_task->pid;
+		rc->uwbd.pid = rc->uwbd.task->pid;
 }
 
 /* Stop the UWB daemon and free any unprocessed events */
-void uwbd_stop(void)
+void uwbd_stop(struct uwb_rc *rc)
 {
-	unsigned long flags;
-	struct uwb_event *evt, *nxt;
-	kthread_stop(uwbd_task);
-	spin_lock_irqsave(&uwbd_event_list_lock, flags);
-	uwbd_pid = 0;
-	list_for_each_entry_safe(evt, nxt, &uwbd_event_list, list_node) {
-		if (evt->type == UWB_EVT_TYPE_NOTIF)
-			kfree(evt->notif.rceb);
-		kfree(evt);
-	}
-	spin_unlock_irqrestore(&uwbd_event_list_lock, flags);
-	uwb_beca_release();
+	kthread_stop(rc->uwbd.task);
+	uwbd_flush(rc);
 }
 
 /*
@@ -377,18 +345,20 @@ void uwbd_stop(void)
  */
 void uwbd_event_queue(struct uwb_event *evt)
 {
+	struct uwb_rc *rc = evt->rc;
 	unsigned long flags;
-	spin_lock_irqsave(&uwbd_event_list_lock, flags);
-	if (uwbd_pid != 0) {
-		list_add(&evt->list_node, &uwbd_event_list);
-		wake_up_all(&uwbd_wq);
+
+	spin_lock_irqsave(&rc->uwbd.event_list_lock, flags);
+	if (rc->uwbd.pid != 0) {
+		list_add(&evt->list_node, &rc->uwbd.event_list);
+		wake_up_all(&rc->uwbd.wq);
 	} else {
 		__uwb_rc_put(evt->rc);
 		if (evt->type == UWB_EVT_TYPE_NOTIF)
 			kfree(evt->notif.rceb);
 		kfree(evt);
 	}
-	spin_unlock_irqrestore(&uwbd_event_list_lock, flags);
+	spin_unlock_irqrestore(&rc->uwbd.event_list_lock, flags);
 	return;
 }
 
@@ -396,10 +366,8 @@ void uwbd_flush(struct uwb_rc *rc)
 {
 	struct uwb_event *evt, *nxt;
 
-	mutex_lock(&uwbd_event_mutex);
-
-	spin_lock_irq(&uwbd_event_list_lock);
-	list_for_each_entry_safe(evt, nxt, &uwbd_event_list, list_node) {
+	spin_lock_irq(&rc->uwbd.event_list_lock);
+	list_for_each_entry_safe(evt, nxt, &rc->uwbd.event_list, list_node) {
 		if (evt->rc == rc) {
 			__uwb_rc_put(rc);
 			list_del(&evt->list_node);
@@ -408,7 +376,5 @@ void uwbd_flush(struct uwb_rc *rc)
 			kfree(evt);
 		}
 	}
-	spin_unlock_irq(&uwbd_event_list_lock);
-
-	mutex_unlock(&uwbd_event_mutex);
+	spin_unlock_irq(&rc->uwbd.event_list_lock);
 }
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index 6d93f54b887..881f0c5b6d2 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -30,6 +30,7 @@
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/timer.h>
+#include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/uwb/spec.h>
 
@@ -86,6 +87,22 @@ struct uwb_notifs_chain {
 	struct mutex mutex;
 };
 
+/* Beacon cache list */
+struct uwb_beca {
+	struct list_head list;
+	size_t entries;
+	struct mutex mutex;
+};
+
+/* Event handling thread. */
+struct uwbd {
+	int pid;
+	struct task_struct *task;
+	wait_queue_head_t wq;
+	struct list_head event_list;
+	spinlock_t event_list_lock;
+};
+
 /**
  * struct uwb_mas_bm - a bitmap of all MAS in a superframe
  * @bm: a bitmap of length #UWB_NUM_MAS
@@ -342,6 +359,9 @@ struct uwb_rc {
 	enum uwb_scan_type scan_type:3;
 	unsigned ready:1;
 	struct uwb_notifs_chain notifs_chain;
+	struct uwb_beca uwb_beca;
+
+	struct uwbd uwbd;
 
 	struct uwb_drp_avail drp_avail;
 	struct list_head reservations;
-- 
cgit v1.2.3-70-g09d2


From 71566a0d161edec70361b7f90f6e54af6a6d5d05 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 31 Oct 2008 12:57:20 +0100
Subject: tracing/fastboot: Enable boot tracing only during initcalls

Impact: modify boot tracer

We used to disable the initcall tracing at a specified time (IE: end
of builtin initcalls). But we don't need it anymore. It will be
stopped when initcalls are finished.

However we want two things:

_Start this tracing only after pre-smp initcalls are finished.

_Since we are planning to trace sched_switches at the same time, we
want to enable them only during the initcall execution.

For this purpose, this patch introduce two functions to enable/disable
the sched_switch tracing during boot.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h    | 24 ++++++++++++++++++++++--
 init/main.c               |  4 +++-
 kernel/trace/trace_boot.c | 28 ++++++++++++++++------------
 3 files changed, 41 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e46a7b34037..4642959e5bd 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -234,6 +234,11 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 #endif
 
 
+/*
+ * Structure which defines the trace of an initcall.
+ * You don't have to fill the func field since it is
+ * only used internally by the tracer.
+ */
 struct boot_trace {
 	pid_t			caller;
 	char			func[KSYM_NAME_LEN];
@@ -244,13 +249,28 @@ struct boot_trace {
 };
 
 #ifdef CONFIG_BOOT_TRACER
+/* Append the trace on the ring-buffer */
 extern void trace_boot(struct boot_trace *it, initcall_t fn);
+
+/* Tells the tracer that smp_pre_initcall is finished.
+ * So we can start the tracing
+ */
 extern void start_boot_trace(void);
-extern void stop_boot_trace(void);
+
+/* Resume the tracing of other necessary events
+ * such as sched switches
+ */
+extern void enable_boot_trace(void);
+
+/* Suspend this tracing. Actually, only sched_switches tracing have
+ * to be suspended. Initcalls doesn't need it.)
+ */
+extern void disable_boot_trace(void);
 #else
 static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
 static inline void start_boot_trace(void) { }
-static inline void stop_boot_trace(void) { }
+static inline void enable_boot_trace(void) { }
+static inline void disable_boot_trace(void) { }
 #endif
 
 
diff --git a/init/main.c b/init/main.c
index 7e117a231af..4b03cd5656c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -711,6 +711,7 @@ int do_one_initcall(initcall_t fn)
 		it.caller = task_pid_nr(current);
 		printk("calling  %pF @ %i\n", fn, it.caller);
 		it.calltime = ktime_get();
+		enable_boot_trace();
 	}
 
 	it.result = fn();
@@ -722,6 +723,7 @@ int do_one_initcall(initcall_t fn)
 		printk("initcall %pF returned %d after %Ld usecs\n", fn,
 			it.result, it.duration);
 		trace_boot(&it, fn);
+		disable_boot_trace();
 	}
 
 	msgbuf[0] = 0;
@@ -882,7 +884,7 @@ static int __init kernel_init(void * unused)
 	 * we're essentially up and running. Get rid of the
 	 * initmem segments and start the user-mode stuff..
 	 */
-	stop_boot_trace();
+
 	init_post();
 	return 0;
 }
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff..d104d5b4641 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,23 +13,29 @@
 #include "trace.h"
 
 static struct trace_array *boot_trace;
-static int trace_boot_enabled;
+static bool pre_initcalls_finished;
 
-
-/* Should be started after do_pre_smp_initcalls() in init/main.c */
+/* Tells the boot tracer that the pre_smp_initcalls are finished.
+ * So we are ready .
+ * It doesn't enable sched events tracing however.
+ * You have to call enable_boot_trace to do so.
+ */
 void start_boot_trace(void)
 {
-	trace_boot_enabled = 1;
+	pre_initcalls_finished = true;
+}
+
+void enable_boot_trace(void)
+{
 }
 
-void stop_boot_trace(void)
+void disable_boot_trace(void)
 {
-	trace_boot_enabled = 0;
 }
 
 void reset_boot_trace(struct trace_array *tr)
 {
-	stop_boot_trace();
+	disable_boot_trace();
 }
 
 static void boot_trace_init(struct trace_array *tr)
@@ -37,8 +43,6 @@ static void boot_trace_init(struct trace_array *tr)
 	int cpu;
 	boot_trace = tr;
 
-	trace_boot_enabled = 0;
-
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 }
@@ -46,9 +50,9 @@ static void boot_trace_init(struct trace_array *tr)
 static void boot_trace_ctrl_update(struct trace_array *tr)
 {
 	if (tr->ctrl)
-		start_boot_trace();
+		enable_boot_trace();
 	else
-		stop_boot_trace();
+		disable_boot_trace();
 }
 
 static enum print_line_t initcall_print_line(struct trace_iterator *iter)
@@ -99,7 +103,7 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!trace_boot_enabled)
+	if (!pre_initcalls_finished)
 		return;
 
 	/* Get its name now since this function could
-- 
cgit v1.2.3-70-g09d2


From fd8cd7e1919fc1c27fe2fdccd2a1cd32f791ef0f Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Mon, 3 Nov 2008 15:50:38 -0800
Subject: x86: vmware: look for DMI string in the product serial key

Impact: Should permit VMware detection on older platforms where the
vendor is changed.  Could theoretically cause a regression if some
weird serial number scheme contains the string "VMware" by pure
chance.  Seems unlikely, especially with the mixed case.

In some user configured cases, VMware may choose not to put a VMware specific
DMI string, but the product serial key is always there and is VMware specific.
Add a interface to check the serial key, when checking for VMware in the DMI
information.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/vmware.c |  7 ++++++-
 drivers/firmware/dmi_scan.c  | 11 +++++++++++
 include/linux/dmi.h          |  2 ++
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index a0905ecfe7d..c034bda842d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -61,6 +61,11 @@ static unsigned long __vmware_get_tsc_khz(void)
         return tsc_hz;
 }
 
+/*
+ * While checking the dmi string infomation, just checking the product
+ * serial key should be enough, as this will always have a VMware
+ * specific string when running under VMware hypervisor.
+ */
 int vmware_platform(void)
 {
 	if (cpu_has_hypervisor) {
@@ -74,7 +79,7 @@ int vmware_platform(void)
 		hyper_vendor_id[12] = '\0';
 		if (!strcmp(hyper_vendor_id, "VMwareVMware"))
 			return 1;
-	} else if (dmi_available && dmi_name_in_vendors("VMware") &&
+	} else if (dmi_available && dmi_name_in_serial("VMware") &&
 		   __vmware_platform())
 		return 1;
 
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 3e526b6d00c..d66d4128290 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -467,6 +467,17 @@ const char *dmi_get_system_info(int field)
 }
 EXPORT_SYMBOL(dmi_get_system_info);
 
+/**
+ *	dmi_name_in_serial - 	Check if string is in the DMI product serial
+ *				information.
+ */
+int dmi_name_in_serial(const char *str)
+{
+	int f = DMI_PRODUCT_SERIAL;
+	if (dmi_ident[f] && strstr(dmi_ident[f], str))
+		return 1;
+	return 0;
+}
 
 /**
  *	dmi_name_in_vendors - Check if string is anywhere in the DMI vendor information.
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index e5084eb5943..2bfda178f27 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -44,6 +44,7 @@ extern const struct dmi_device * dmi_find_device(int type, const char *name,
 extern void dmi_scan_machine(void);
 extern int dmi_get_year(int field);
 extern int dmi_name_in_vendors(const char *str);
+extern int dmi_name_in_serial(const char *str);
 extern int dmi_available;
 extern int dmi_walk(void (*decode)(const struct dmi_header *));
 
@@ -56,6 +57,7 @@ static inline const struct dmi_device * dmi_find_device(int type, const char *na
 static inline void dmi_scan_machine(void) { return; }
 static inline int dmi_get_year(int year) { return 0; }
 static inline int dmi_name_in_vendors(const char *s) { return 0; }
+static inline int dmi_name_in_serial(const char *s) { return 0; }
 #define dmi_available 0
 static inline int dmi_walk(void (*decode)(const struct dmi_header *))
 	{ return -1; }
-- 
cgit v1.2.3-70-g09d2


From 7d43d1a0f2cf535167ec7247f110a1f85cecac43 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 4 Nov 2008 23:43:47 -0800
Subject: dccp: Implement lookup table for feature-negotiation information

A lookup table for feature-negotiation information, extracted from RFC
4340/42, is provided by this patch. All currently known features can
be found in this table, along with their feature location, their
default value, and type.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  9 +++++----
 net/dccp/feat.c      | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 6080449fbec..3978aff197d 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -176,19 +176,20 @@ enum {
 };
 
 /* DCCP features (RFC 4340 section 6.4) */
-enum {
+enum dccp_feature_numbers {
 	DCCPF_RESERVED = 0,
 	DCCPF_CCID = 1,
-	DCCPF_SHORT_SEQNOS = 2,		/* XXX: not yet implemented */
+	DCCPF_SHORT_SEQNOS = 2,
 	DCCPF_SEQUENCE_WINDOW = 3,
-	DCCPF_ECN_INCAPABLE = 4,	/* XXX: not yet implemented */
+	DCCPF_ECN_INCAPABLE = 4,
 	DCCPF_ACK_RATIO = 5,
 	DCCPF_SEND_ACK_VECTOR = 6,
 	DCCPF_SEND_NDP_COUNT = 7,
 	DCCPF_MIN_CSUM_COVER = 8,
-	DCCPF_DATA_CHECKSUM = 9,	/* XXX: not yet implemented */
+	DCCPF_DATA_CHECKSUM = 9,
 	/* 10-127 reserved */
 	DCCPF_MIN_CCID_SPECIFIC = 128,
+	DCCPF_SEND_LEV_RATE = 192,	/* RFC 4342, sec. 8.4 */
 	DCCPF_MAX_CCID_SPECIFIC = 255,
 };
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 933a0ecf8d4..45e36fc7943 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -23,6 +23,43 @@
 
 #define DCCP_FEAT_SP_NOAGREE (-123)
 
+static const struct {
+	u8			feat_num;		/* DCCPF_xxx */
+	enum dccp_feat_type	rxtx;			/* RX or TX  */
+	enum dccp_feat_type	reconciliation;		/* SP or NN  */
+	u8			default_value;		/* as in 6.4 */
+/*
+ *    Lookup table for location and type of features (from RFC 4340/4342)
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ *  | Feature                  | Location | Reconc. | Initial |  Section  |
+ *  |                          | RX | TX  | SP | NN |  Value  | Reference |
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ *  | DCCPF_CCID               |    |  X  | X  |    |   2     | 10        |
+ *  | DCCPF_SHORT_SEQNOS       |    |  X  | X  |    |   0     |  7.6.1    |
+ *  | DCCPF_SEQUENCE_WINDOW    |    |  X  |    | X  | 100     |  7.5.2    |
+ *  | DCCPF_ECN_INCAPABLE      | X  |     | X  |    |   0     | 12.1      |
+ *  | DCCPF_ACK_RATIO          |    |  X  |    | X  |   2     | 11.3      |
+ *  | DCCPF_SEND_ACK_VECTOR    | X  |     | X  |    |   0     | 11.5      |
+ *  | DCCPF_SEND_NDP_COUNT     |    |  X  | X  |    |   0     |  7.7.2    |
+ *  | DCCPF_MIN_CSUM_COVER     | X  |     | X  |    |   0     |  9.2.1    |
+ *  | DCCPF_DATA_CHECKSUM      | X  |     | X  |    |   0     |  9.3.1    |
+ *  | DCCPF_SEND_LEV_RATE      | X  |     | X  |    |   0     | 4342/8.4  |
+ *  +--------------------------+----+-----+----+----+---------+-----------+
+ */
+} dccp_feat_table[] = {
+	{ DCCPF_CCID,		 FEAT_AT_TX, FEAT_SP, 2 },
+	{ DCCPF_SHORT_SEQNOS,	 FEAT_AT_TX, FEAT_SP, 0 },
+	{ DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100 },
+	{ DCCPF_ECN_INCAPABLE,	 FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_ACK_RATIO,	 FEAT_AT_TX, FEAT_NN, 2 },
+	{ DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_SEND_NDP_COUNT,  FEAT_AT_TX, FEAT_SP, 0 },
+	{ DCCPF_MIN_CSUM_COVER,  FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_DATA_CHECKSUM,	 FEAT_AT_RX, FEAT_SP, 0 },
+	{ DCCPF_SEND_LEV_RATE,	 FEAT_AT_RX, FEAT_SP, 0 },
+};
+#define DCCP_FEAT_SUPPORTED_MAX		ARRAY_SIZE(dccp_feat_table)
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
@@ -639,6 +676,8 @@ const char *dccp_feat_name(const u8 feat)
 	if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
 		return feature_names[DCCPF_RESERVED];
 
+	if (feat ==  DCCPF_SEND_LEV_RATE)
+		return "Send Loss Event Rate";
 	if (feat >= DCCPF_MIN_CCID_SPECIFIC)
 		return "CCID-specific";
 
-- 
cgit v1.2.3-70-g09d2


From ac75773c2742d82cbcb078708df406e9017224b7 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Tue, 4 Nov 2008 23:55:49 -0800
Subject: dccp: Per-socket initialisation of feature negotiation

This provides feature-negotiation initialisation for both DCCP sockets
and DCCP request_sockets, to support feature negotiation during
connection setup.

It also resolves a FIXME regarding the congestion control
initialisation.

Thanks to Wei Yongjun for help with the IPv6 side of this patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  4 ++++
 net/dccp/dccp.h      |  3 ++-
 net/dccp/feat.c      | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/dccp/feat.h      |  1 +
 net/dccp/input.c     |  2 --
 net/dccp/ipv4.c      |  3 ++-
 net/dccp/ipv6.c      |  3 ++-
 net/dccp/minisocks.c |  7 ++++++-
 net/dccp/proto.c     |  1 +
 9 files changed, 73 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 3978aff197d..484b8a1fb02 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -412,6 +412,7 @@ extern void dccp_minisock_init(struct dccp_minisock *dmsk);
  * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1)
  * @dreq_isr: initial sequence number received on the Request
  * @dreq_service: service code present on the Request (there is just one)
+ * @dreq_featneg: feature negotiation options for this connection
  * The following two fields are analogous to the ones in dccp_sock:
  * @dreq_timestamp_echo: last received timestamp to echo (13.1)
  * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo
@@ -421,6 +422,7 @@ struct dccp_request_sock {
 	__u64			 dreq_iss;
 	__u64			 dreq_isr;
 	__be32			 dreq_service;
+	struct list_head	 dreq_featneg;
 	__u32			 dreq_timestamp_echo;
 	__u32			 dreq_timestamp_time;
 };
@@ -498,6 +500,7 @@ struct dccp_ackvec;
  * @dccps_mss_cache - current value of MSS (path MTU minus header sizes)
  * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4)
  * @dccps_minisock - associated minisock (accessed via dccp_msk)
+ * @dccps_featneg - tracks feature-negotiation state (mostly during handshake)
  * @dccps_hc_rx_ackvec - rx half connection ack vector
  * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection)
  * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection)
@@ -535,6 +538,7 @@ struct dccp_sock {
 	__u64				dccps_ndp_count:48;
 	unsigned long			dccps_rate_last;
 	struct dccp_minisock		dccps_minisock;
+	struct list_head		dccps_featneg;
 	struct dccp_ackvec		*dccps_hc_rx_ackvec;
 	struct ccid			*dccps_hc_rx_ccid;
 	struct ccid			*dccps_hc_tx_ccid;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index d6fed595a3f..dee4a90886d 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -252,7 +252,8 @@ extern const char *dccp_state_name(const int state);
 extern void dccp_set_state(struct sock *sk, const int state);
 extern void dccp_done(struct sock *sk);
 
-extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb);
+extern int  dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+			    struct sk_buff const *skb);
 
 extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 9a37b6ce3ac..069d8ffe4c6 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -90,6 +90,20 @@ static u8 dccp_feat_type(u8 feat_num)
 	return dccp_feat_table[idx].reconciliation;
 }
 
+/* copy constructor, fval must not already contain allocated memory */
+static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
+{
+	fval->sp.len = len;
+	if (fval->sp.len > 0) {
+		fval->sp.vec = kmemdup(val, len, gfp_any());
+		if (fval->sp.vec == NULL) {
+			fval->sp.len = 0;
+			return -ENOBUFS;
+		}
+	}
+	return 0;
+}
+
 static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
 {
 	if (unlikely(val == NULL))
@@ -99,6 +113,28 @@ static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
 	memset(val, 0, sizeof(*val));
 }
 
+static struct dccp_feat_entry *
+	      dccp_feat_clone_entry(struct dccp_feat_entry const *original)
+{
+	struct dccp_feat_entry *new;
+	u8 type = dccp_feat_type(original->feat_num);
+
+	if (type == FEAT_UNKNOWN)
+		return NULL;
+
+	new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
+	if (new == NULL)
+		return NULL;
+
+	if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
+						      original->val.sp.vec,
+						      original->val.sp.len)) {
+		kfree(new);
+		return NULL;
+	}
+	return new;
+}
+
 static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
 {
 	if (entry != NULL) {
@@ -133,6 +169,25 @@ void dccp_feat_list_purge(struct list_head *fn_list)
 }
 EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
 
+/* generate @to as full clone of @from - @to must not contain any nodes */
+int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
+{
+	struct dccp_feat_entry *entry, *new;
+
+	INIT_LIST_HEAD(to);
+	list_for_each_entry(entry, from, node) {
+		new = dccp_feat_clone_entry(entry);
+		if (new == NULL)
+			goto cloning_failed;
+		list_add_tail(&new->node, to);
+	}
+	return 0;
+
+cloning_failed:
+	dccp_feat_list_purge(to);
+	return -ENOMEM;
+}
+
 int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
 		     u8 *val, u8 len, gfp_t gfp)
 {
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 56df82ceef0..f5e99b39712 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -96,6 +96,7 @@ extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
 				   u8 *val, u8 len);
 extern void dccp_feat_clean(struct dccp_minisock *dmsk);
 extern int  dccp_feat_clone(struct sock *oldsk, struct sock *newsk);
+extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);
 extern int  dccp_feat_init(struct dccp_minisock *dmsk);
 
 #endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 779d0ed9ae9..3070015edc7 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -590,8 +590,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
 								    skb) < 0)
 				return 1;
-
-			/* FIXME: do congestion control initialization */
 			goto discard;
 		}
 		if (dh->dccph_type == DCCP_PKT_RESET)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 01e3e020625..cbf522dfecc 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -595,7 +595,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	dccp_reqsk_init(req, skb);
+	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+		goto drop_and_free;
 
 	dreq = dccp_rsk(req);
 	if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d4ce1224e00..4e172ccfd76 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -426,7 +426,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-	dccp_reqsk_init(req, skb);
+	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+		goto drop_and_free;
 
 	dreq = dccp_rsk(req);
 	if (dccp_parse_options(sk, dreq, skb))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index e6bf99e3e41..afdacbb94d7 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -125,6 +125,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
 		newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
 		newicsk->icsk_rto	    = DCCP_TIMEOUT_INIT;
 
+		INIT_LIST_HEAD(&newdp->dccps_featneg);
 		if (dccp_feat_clone(sk, newsk))
 			goto out_free;
 
@@ -304,7 +305,8 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 
 EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
 
-void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
+int dccp_reqsk_init(struct request_sock *req,
+		    struct dccp_sock const *dp, struct sk_buff const *skb)
 {
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
@@ -313,6 +315,9 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
 	inet_rsk(req)->acked	  = 0;
 	req->rcv_wnd		  = sysctl_dccp_feat_sequence_window;
 	dreq->dreq_timestamp_echo = 0;
+
+	/* inherit feature negotiation options from listening socket */
+	return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
 }
 
 EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d0bd3481976..1cdf4ae9960 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -193,6 +193,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 
 	dccp_init_xmit_timers(sk);
 
+	INIT_LIST_HEAD(&dp->dccps_featneg);
 	/*
 	 * FIXME: We're hardcoding the CCID, and doing this at this point makes
 	 * the listening (master) sock get CCID control blocks, which is not
-- 
cgit v1.2.3-70-g09d2


From 1f29fae29709b4668979e244c09b2fa78ff1ad59 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Wed, 5 Nov 2008 16:08:52 -0600
Subject: file capabilities: add no_file_caps switch (v4)

Add a no_file_caps boot option when file capabilities are
compiled into the kernel (CONFIG_SECURITY_FILE_CAPABILITIES=y).

This allows distributions to ship a kernel with file capabilities
compiled in, without forcing users to use (and understand and
trust) them.

When no_file_caps is specified at boot, then when a process executes
a file, any file capabilities stored with that file will not be
used in the calculation of the process' new capability sets.

This means that booting with the no_file_caps boot option will
not be the same as booting a kernel with file capabilities
compiled out - in particular a task with  CAP_SETPCAP will not
have any chance of passing capabilities to another task (which
isn't "really" possible anyway, and which may soon by killed
altogether by David Howells in any case), and it will instead
be able to put new capabilities in its pI.  However since fI
will always be empty and pI is masked with fI, it gains the
task nothing.

We also support the extra prctl options, setting securebits and
dropping capabilities from the per-process bounding set.

The other remaining difference is that killpriv, task_setscheduler,
setioprio, and setnice will continue to be hooked.  That will
be noticable in the case where a root task changed its uid
while keeping some caps, and another task owned by the new uid
tries to change settings for the more privileged task.

Changelog:
	Nov 05 2008: (v4) trivial port on top of always-start-\
		with-clear-caps patch
	Sep 23 2008: nixed file_caps_enabled when file caps are
		not compiled in as it isn't used.
		Document no_file_caps in kernel-parameters.txt.

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/kernel-parameters.txt |  4 ++++
 include/linux/capability.h          |  3 +++
 kernel/capability.c                 | 11 +++++++++++
 security/commoncap.c                |  3 +++
 4 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1bbcaa8982b..784443acca9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1459,6 +1459,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			instruction doesn't work correctly and not to
 			use it.
 
+	no_file_caps	Tells the kernel not to honor file capabilities.  The
+			only way then for a file to be executed with privilege
+			is to be setuid root or executed by root.
+
 	nohalt		[IA-64] Tells the kernel not to use the power saving
 			function PAL_HALT_LIGHT when idle. This increases
 			power-consumption. On the positive side, it reduces
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 9d1fe30b6f6..5bc145bd759 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -68,6 +68,9 @@ typedef struct __user_cap_data_struct {
 #define VFS_CAP_U32             VFS_CAP_U32_2
 #define VFS_CAP_REVISION	VFS_CAP_REVISION_2
 
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+extern int file_caps_enabled;
+#endif
 
 struct vfs_cap_data {
 	__le32 magic_etc;            /* Little endian */
diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d..e13a68535ad 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -33,6 +33,17 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
 
+#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
+int file_caps_enabled = 1;
+
+static int __init file_caps_disable(char *str)
+{
+	file_caps_enabled = 0;
+	return 1;
+}
+__setup("no_file_caps", file_caps_disable);
+#endif
+
 /*
  * More recent versions of libcap are available from:
  *
diff --git a/security/commoncap.c b/security/commoncap.c
index 3976613db82..f88119cb2bc 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -281,6 +281,9 @@ static int get_file_caps(struct linux_binprm *bprm)
 
 	bprm_clear_caps(bprm);
 
+	if (!file_caps_enabled)
+		return 0;
+
 	if (bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)
 		return 0;
 
-- 
cgit v1.2.3-70-g09d2


From ae33bc40c0d96d02f51a996482ea7e41c5152695 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 5 Nov 2008 16:00:02 -0800
Subject: net: Guaranetee the proper ordering of the loopback device.

I was recently hunting a bug that occurred in network namespace
cleanup.  In looking at the code it became apparrent that we have
and will continue to have cases where if we have anything going
on in a network namespace there will be assumptions that the
loopback device is present.   Things like sending igmp unsubscribe
messages when we bring down network devices invokes the routing
code which assumes that at least the loopback driver is present.

Therefore to avoid magic initcall ordering hackery that is hard
to follow and hard to get right insert a call to register the
loopback device directly from net_dev_init().    This guarantes
that the loopback device is the first device registered and
the last network device to go away.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 13 ++-----------
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 12 ++++++++++++
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 91d08585a6d..c4516b580ba 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -204,17 +204,8 @@ static __net_exit void loopback_net_exit(struct net *net)
 	unregister_netdev(dev);
 }
 
-static struct pernet_operations __net_initdata loopback_net_ops = {
+/* Registered in net/core/dev.c */
+struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
        .exit = loopback_net_exit,
 };
-
-static int __init loopback_init(void)
-{
-	return register_pernet_device(&loopback_net_ops);
-}
-
-/* Loopback is special. It should be initialized before any other network
- * device and network subsystem.
- */
-fs_initcall(loopback_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f1b0dbe5846..12d7f4469dc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	return 0;
 }
 
+extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 9475f3e624a..811507c3980 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4904,6 +4904,18 @@ static int __init net_dev_init(void)
 	if (register_pernet_subsys(&netdev_net_ops))
 		goto out;
 
+	/* The loopback device is special if any other network devices
+	 * is present in a network namespace the loopback device must
+	 * be present. Since we now dynamically allocate and free the
+	 * loopback device ensure this invariant is maintained by
+	 * keeping the loopback device as the first device on the
+	 * list of network devices.  Ensuring the loopback devices
+	 * is the first device that appears and the last network device
+	 * that disappears.
+	 */
+	if (register_pernet_device(&loopback_net_ops))
+		goto out;
+
 	if (register_pernet_device(&default_device_ops))
 		goto out;
 
-- 
cgit v1.2.3-70-g09d2


From fd9abb3d97c2ab883e4732ec1214fe64190236e7 Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Wed, 5 Nov 2008 00:35:37 +0000
Subject: SMSC LAN911x and LAN921x vendor driver

Attached is a driver for SMSC's LAN911x and LAN921x families of embedded
ethernet controllers.

There is an existing smc911x driver in the tree; this is intended to
replace it.  Dustin McIntire (the author of the smc911x driver) has
expressed his support for switching to this driver.

This driver contains workarounds for all known hardware issues, and has
been tested on all flavours of the chip on multiple architectures.

This driver now uses phylib, so this patch also adds support for the
device's internal phy

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Signed-off-by: Bahadir Balban <Bahadir.Balban@arm.com>
Signed-off-by: Dustin Mcintire <dustin@sensoria.com>
Signed-off-by: Bill Gatliff <bgat@billgatliff.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 MAINTAINERS              |    6 +
 drivers/net/Kconfig      |   14 +
 drivers/net/Makefile     |    1 +
 drivers/net/phy/smsc.c   |   28 +
 drivers/net/smsc911x.c   | 2091 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/smsc911x.h   |  394 +++++++++
 include/linux/smsc911x.h |   42 +
 7 files changed, 2576 insertions(+)
 create mode 100644 drivers/net/smsc911x.c
 create mode 100644 drivers/net/smsc911x.h
 create mode 100644 include/linux/smsc911x.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 129117fe649..5d951f3db01 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3853,6 +3853,12 @@ M:	mhoffman@lightlink.com
 L:	lm-sensors@lm-sensors.org
 S:	Maintained
 
+SMSC911x ETHERNET DRIVER
+P:	Steve Glendinning
+M:	steve.glendinning@smsc.com
+L:	netdev@vger.kernel.org
+S:	Supported
+
 SMX UIO Interface
 P:	Ben Nizette
 M:	bn@niasdigital.com
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index f1d0a137169..74a18a7f8f4 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -979,6 +979,20 @@ config SMC911X
 	  called smc911x.  If you want to compile it as a module, say M 
 	  here and read <file:Documentation/kbuild/modules.txt>
 
+config SMSC911X
+	tristate "SMSC LAN911x/LAN921x families embedded ethernet support"
+	depends on ARM || SUPERH
+	select CRC32
+	select MII
+	select PHYLIB
+	---help---
+	  Say Y here if you want support for SMSC LAN911x and LAN921x families
+	  of ethernet controllers.
+
+	  To compile this driver as a module, choose M here and read
+	  <file:Documentation/networking/net-modules.txt>. The module
+	  will be called smsc911x.
+
 config NET_VENDOR_RACAL
 	bool "Racal-Interlan (Micom) NI cards"
 	depends on ISA
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 657c47b1a6b..e06829aa75b 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -219,6 +219,7 @@ obj-$(CONFIG_S2IO) += s2io.o
 obj-$(CONFIG_MYRI10GE) += myri10ge/
 obj-$(CONFIG_SMC91X) += smc91x.o
 obj-$(CONFIG_SMC911X) += smc911x.o
+obj-$(CONFIG_SMSC911X) += smsc911x.o
 obj-$(CONFIG_BFIN_MAC) += bfin_mac.o
 obj-$(CONFIG_DM9000) += dm9000.o
 obj-$(CONFIG_PASEMI_MAC) += pasemi_mac_driver.o
diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 73baa7a3bb0..c05d38d4635 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -126,6 +126,27 @@ static struct phy_driver lan8700_driver = {
 	.driver		= { .owner = THIS_MODULE, }
 };
 
+static struct phy_driver lan911x_int_driver = {
+	.phy_id		= 0x0007c0d0, /* OUI=0x00800f, Model#=0x0d */
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "SMSC LAN911x Internal PHY",
+
+	.features	= (PHY_BASIC_FEATURES | SUPPORTED_Pause
+				| SUPPORTED_Asym_Pause),
+	.flags		= PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
+
+	/* basic functions */
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.config_init	= smsc_phy_config_init,
+
+	/* IRQ related */
+	.ack_interrupt	= smsc_phy_ack_interrupt,
+	.config_intr	= smsc_phy_config_intr,
+
+	.driver		= { .owner = THIS_MODULE, }
+};
+
 static int __init smsc_init(void)
 {
 	int ret;
@@ -142,8 +163,14 @@ static int __init smsc_init(void)
 	if (ret)
 		goto err3;
 
+	ret = phy_driver_register (&lan911x_int_driver);
+	if (ret)
+		goto err4;
+
 	return 0;
 
+err4:
+	phy_driver_unregister (&lan8700_driver);
 err3:
 	phy_driver_unregister (&lan8187_driver);
 err2:
@@ -154,6 +181,7 @@ err1:
 
 static void __exit smsc_exit(void)
 {
+	phy_driver_unregister (&lan911x_int_driver);
 	phy_driver_unregister (&lan8700_driver);
 	phy_driver_unregister (&lan8187_driver);
 	phy_driver_unregister (&lan83c185_driver);
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
new file mode 100644
index 00000000000..fe517880fc9
--- /dev/null
+++ b/drivers/net/smsc911x.c
@@ -0,0 +1,2091 @@
+/***************************************************************************
+ *
+ * Copyright (C) 2004-2008 SMSC
+ * Copyright (C) 2005-2008 ARM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ ***************************************************************************
+ * Rewritten, heavily based on smsc911x simple driver by SMSC.
+ * Partly uses io macros from smc91x.c by Nicolas Pitre
+ *
+ * Supported devices:
+ *   LAN9115, LAN9116, LAN9117, LAN9118
+ *   LAN9215, LAN9216, LAN9217, LAN9218
+ *   LAN9210, LAN9211
+ *   LAN9220, LAN9221
+ *
+ */
+
+#include <linux/crc32.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/version.h>
+#include <linux/bug.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/phy.h>
+#include <linux/smsc911x.h>
+#include "smsc911x.h"
+
+#define SMSC_CHIPNAME		"smsc911x"
+#define SMSC_MDIONAME		"smsc911x-mdio"
+#define SMSC_DRV_VERSION	"2008-10-21"
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(SMSC_DRV_VERSION);
+
+#if USE_DEBUG > 0
+static int debug = 16;
+#else
+static int debug = 3;
+#endif
+
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
+struct smsc911x_data {
+	void __iomem *ioaddr;
+
+	unsigned int idrev;
+
+	/* used to decide which workarounds apply */
+	unsigned int generation;
+
+	/* device configuration (copied from platform_data during probe) */
+	unsigned int irq_polarity;
+	unsigned int irq_type;
+	phy_interface_t phy_interface;
+
+	/* This needs to be acquired before calling any of below:
+	 * smsc911x_mac_read(), smsc911x_mac_write()
+	 */
+	spinlock_t mac_lock;
+
+#if (!SMSC_CAN_USE_32BIT)
+	/* spinlock to ensure 16-bit accesses are serialised */
+	spinlock_t dev_lock;
+#endif
+
+	struct phy_device *phy_dev;
+	struct mii_bus *mii_bus;
+	int phy_irq[PHY_MAX_ADDR];
+	unsigned int using_extphy;
+	int last_duplex;
+	int last_carrier;
+
+	u32 msg_enable;
+	unsigned int gpio_setting;
+	unsigned int gpio_orig_setting;
+	struct net_device *dev;
+	struct napi_struct napi;
+
+	unsigned int software_irq_signal;
+
+#ifdef USE_PHY_WORK_AROUND
+#define MIN_PACKET_SIZE (64)
+	char loopback_tx_pkt[MIN_PACKET_SIZE];
+	char loopback_rx_pkt[MIN_PACKET_SIZE];
+	unsigned int resetcount;
+#endif
+
+	/* Members for Multicast filter workaround */
+	unsigned int multicast_update_pending;
+	unsigned int set_bits_mask;
+	unsigned int clear_bits_mask;
+	unsigned int hashhi;
+	unsigned int hashlo;
+};
+
+#if SMSC_CAN_USE_32BIT
+
+static inline u32 smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
+{
+	return readl(pdata->ioaddr + reg);
+}
+
+static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
+				      u32 val)
+{
+	writel(val, pdata->ioaddr + reg);
+}
+
+/* Writes a packet to the TX_DATA_FIFO */
+static inline void
+smsc911x_tx_writefifo(struct smsc911x_data *pdata, unsigned int *buf,
+		      unsigned int wordcount)
+{
+	writesl(pdata->ioaddr + TX_DATA_FIFO, buf, wordcount);
+}
+
+/* Reads a packet out of the RX_DATA_FIFO */
+static inline void
+smsc911x_rx_readfifo(struct smsc911x_data *pdata, unsigned int *buf,
+		     unsigned int wordcount)
+{
+	readsl(pdata->ioaddr + RX_DATA_FIFO, buf, wordcount);
+}
+
+#else				/* SMSC_CAN_USE_32BIT */
+
+/* These 16-bit access functions are significantly slower, due to the locking
+ * necessary.  If your bus hardware can be configured to do this for you
+ * (in response to a single 32-bit operation from software), you should use
+ * the 32-bit access functions instead. */
+
+static inline u32 smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
+{
+	unsigned long flags;
+	u32 data;
+
+	/* these two 16-bit reads must be performed consecutively, so must
+	 * not be interrupted by our own ISR (which would start another
+	 * read operation) */
+	spin_lock_irqsave(&pdata->dev_lock, flags);
+	data = ((readw(pdata->ioaddr + reg) & 0xFFFF) |
+		((readw(pdata->ioaddr + reg + 2) & 0xFFFF) << 16));
+	spin_unlock_irqrestore(&pdata->dev_lock, flags);
+
+	return data;
+}
+
+static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
+				      u32 val)
+{
+	unsigned long flags;
+
+	/* these two 16-bit writes must be performed consecutively, so must
+	 * not be interrupted by our own ISR (which would start another
+	 * read operation) */
+	spin_lock_irqsave(&pdata->dev_lock, flags);
+	writew(val & 0xFFFF, pdata->ioaddr + reg);
+	writew((val >> 16) & 0xFFFF, pdata->ioaddr + reg + 2);
+	spin_unlock_irqrestore(&pdata->dev_lock, flags);
+}
+
+/* Writes a packet to the TX_DATA_FIFO */
+static inline void
+smsc911x_tx_writefifo(struct smsc911x_data *pdata, unsigned int *buf,
+		      unsigned int wordcount)
+{
+	while (wordcount--)
+		smsc911x_reg_write(pdata, TX_DATA_FIFO, *buf++);
+}
+
+/* Reads a packet out of the RX_DATA_FIFO */
+static inline void
+smsc911x_rx_readfifo(struct smsc911x_data *pdata, unsigned int *buf,
+		     unsigned int wordcount)
+{
+	while (wordcount--)
+		*buf++ = smsc911x_reg_read(pdata, RX_DATA_FIFO);
+}
+
+#endif				/* SMSC_CAN_USE_32BIT */
+
+/* waits for MAC not busy, with timeout.  Only called by smsc911x_mac_read
+ * and smsc911x_mac_write, so assumes mac_lock is held */
+static int smsc911x_mac_complete(struct smsc911x_data *pdata)
+{
+	int i;
+	u32 val;
+
+	SMSC_ASSERT_MAC_LOCK(pdata);
+
+	for (i = 0; i < 40; i++) {
+		val = smsc911x_reg_read(pdata, MAC_CSR_CMD);
+		if (!(val & MAC_CSR_CMD_CSR_BUSY_))
+			return 0;
+	}
+	SMSC_WARNING(HW, "Timed out waiting for MAC not BUSY. "
+		"MAC_CSR_CMD: 0x%08X", val);
+	return -EIO;
+}
+
+/* Fetches a MAC register value. Assumes mac_lock is acquired */
+static u32 smsc911x_mac_read(struct smsc911x_data *pdata, unsigned int offset)
+{
+	unsigned int temp;
+
+	SMSC_ASSERT_MAC_LOCK(pdata);
+
+	temp = smsc911x_reg_read(pdata, MAC_CSR_CMD);
+	if (unlikely(temp & MAC_CSR_CMD_CSR_BUSY_)) {
+		SMSC_WARNING(HW, "MAC busy at entry");
+		return 0xFFFFFFFF;
+	}
+
+	/* Send the MAC cmd */
+	smsc911x_reg_write(pdata, MAC_CSR_CMD, ((offset & 0xFF) |
+		MAC_CSR_CMD_CSR_BUSY_ | MAC_CSR_CMD_R_NOT_W_));
+
+	/* Workaround for hardware read-after-write restriction */
+	temp = smsc911x_reg_read(pdata, BYTE_TEST);
+
+	/* Wait for the read to complete */
+	if (likely(smsc911x_mac_complete(pdata) == 0))
+		return smsc911x_reg_read(pdata, MAC_CSR_DATA);
+
+	SMSC_WARNING(HW, "MAC busy after read");
+	return 0xFFFFFFFF;
+}
+
+/* Set a mac register, mac_lock must be acquired before calling */
+static void smsc911x_mac_write(struct smsc911x_data *pdata,
+			       unsigned int offset, u32 val)
+{
+	unsigned int temp;
+
+	SMSC_ASSERT_MAC_LOCK(pdata);
+
+	temp = smsc911x_reg_read(pdata, MAC_CSR_CMD);
+	if (unlikely(temp & MAC_CSR_CMD_CSR_BUSY_)) {
+		SMSC_WARNING(HW,
+			"smsc911x_mac_write failed, MAC busy at entry");
+		return;
+	}
+
+	/* Send data to write */
+	smsc911x_reg_write(pdata, MAC_CSR_DATA, val);
+
+	/* Write the actual data */
+	smsc911x_reg_write(pdata, MAC_CSR_CMD, ((offset & 0xFF) |
+		MAC_CSR_CMD_CSR_BUSY_));
+
+	/* Workaround for hardware read-after-write restriction */
+	temp = smsc911x_reg_read(pdata, BYTE_TEST);
+
+	/* Wait for the write to complete */
+	if (likely(smsc911x_mac_complete(pdata) == 0))
+		return;
+
+	SMSC_WARNING(HW,
+		"smsc911x_mac_write failed, MAC busy after write");
+}
+
+/* Get a phy register */
+static int smsc911x_mii_read(struct mii_bus *bus, int phyaddr, int regidx)
+{
+	struct smsc911x_data *pdata = (struct smsc911x_data *)bus->priv;
+	unsigned long flags;
+	unsigned int addr;
+	int i, reg;
+
+	spin_lock_irqsave(&pdata->mac_lock, flags);
+
+	/* Confirm MII not busy */
+	if (unlikely(smsc911x_mac_read(pdata, MII_ACC) & MII_ACC_MII_BUSY_)) {
+		SMSC_WARNING(HW,
+			"MII is busy in smsc911x_mii_read???");
+		reg = -EIO;
+		goto out;
+	}
+
+	/* Set the address, index & direction (read from PHY) */
+	addr = ((phyaddr & 0x1F) << 11) | ((regidx & 0x1F) << 6);
+	smsc911x_mac_write(pdata, MII_ACC, addr);
+
+	/* Wait for read to complete w/ timeout */
+	for (i = 0; i < 100; i++)
+		if (!(smsc911x_mac_read(pdata, MII_ACC) & MII_ACC_MII_BUSY_)) {
+			reg = smsc911x_mac_read(pdata, MII_DATA);
+			goto out;
+		}
+
+	SMSC_WARNING(HW, "Timed out waiting for MII write to finish");
+	reg = -EIO;
+
+out:
+	spin_unlock_irqrestore(&pdata->mac_lock, flags);
+	return reg;
+}
+
+/* Set a phy register */
+static int smsc911x_mii_write(struct mii_bus *bus, int phyaddr, int regidx,
+			   u16 val)
+{
+	struct smsc911x_data *pdata = (struct smsc911x_data *)bus->priv;
+	unsigned long flags;
+	unsigned int addr;
+	int i, reg;
+
+	spin_lock_irqsave(&pdata->mac_lock, flags);
+
+	/* Confirm MII not busy */
+	if (unlikely(smsc911x_mac_read(pdata, MII_ACC) & MII_ACC_MII_BUSY_)) {
+		SMSC_WARNING(HW,
+			"MII is busy in smsc911x_mii_write???");
+		reg = -EIO;
+		goto out;
+	}
+
+	/* Put the data to write in the MAC */
+	smsc911x_mac_write(pdata, MII_DATA, val);
+
+	/* Set the address, index & direction (write to PHY) */
+	addr = ((phyaddr & 0x1F) << 11) | ((regidx & 0x1F) << 6) |
+		MII_ACC_MII_WRITE_;
+	smsc911x_mac_write(pdata, MII_ACC, addr);
+
+	/* Wait for write to complete w/ timeout */
+	for (i = 0; i < 100; i++)
+		if (!(smsc911x_mac_read(pdata, MII_ACC) & MII_ACC_MII_BUSY_)) {
+			reg = 0;
+			goto out;
+		}
+
+	SMSC_WARNING(HW, "Timed out waiting for MII write to finish");
+	reg = -EIO;
+
+out:
+	spin_unlock_irqrestore(&pdata->mac_lock, flags);
+	return reg;
+}
+
+/* Autodetects and initialises external phy for SMSC9115 and SMSC9117 flavors.
+ * If something goes wrong, returns -ENODEV to revert back to internal phy.
+ * Performed at initialisation only, so interrupts are enabled */
+static int smsc911x_phy_initialise_external(struct smsc911x_data *pdata)
+{
+	unsigned int hwcfg = smsc911x_reg_read(pdata, HW_CFG);
+
+	/* External phy is requested, supported, and detected */
+	if (hwcfg & HW_CFG_EXT_PHY_DET_) {
+
+		/* Switch to external phy. Assuming tx and rx are stopped
+		 * because smsc911x_phy_initialise is called before
+		 * smsc911x_rx_initialise and tx_initialise. */
+
+		/* Disable phy clocks to the MAC */
+		hwcfg &= (~HW_CFG_PHY_CLK_SEL_);
+		hwcfg |= HW_CFG_PHY_CLK_SEL_CLK_DIS_;
+		smsc911x_reg_write(pdata, HW_CFG, hwcfg);
+		udelay(10);	/* Enough time for clocks to stop */
+
+		/* Switch to external phy */
+		hwcfg |= HW_CFG_EXT_PHY_EN_;
+		smsc911x_reg_write(pdata, HW_CFG, hwcfg);
+
+		/* Enable phy clocks to the MAC */
+		hwcfg &= (~HW_CFG_PHY_CLK_SEL_);
+		hwcfg |= HW_CFG_PHY_CLK_SEL_EXT_PHY_;
+		smsc911x_reg_write(pdata, HW_CFG, hwcfg);
+		udelay(10);	/* Enough time for clocks to restart */
+
+		hwcfg |= HW_CFG_SMI_SEL_;
+		smsc911x_reg_write(pdata, HW_CFG, hwcfg);
+
+		SMSC_TRACE(HW, "Successfully switched to external PHY");
+		pdata->using_extphy = 1;
+	} else {
+		SMSC_WARNING(HW, "No external PHY detected, "
+			"Using internal PHY instead.");
+		/* Use internal phy */
+		return -ENODEV;
+	}
+	return 0;
+}
+
+/* Fetches a tx status out of the status fifo */
+static unsigned int smsc911x_tx_get_txstatus(struct smsc911x_data *pdata)
+{
+	unsigned int result =
+	    smsc911x_reg_read(pdata, TX_FIFO_INF) & TX_FIFO_INF_TSUSED_;
+
+	if (result != 0)
+		result = smsc911x_reg_read(pdata, TX_STATUS_FIFO);
+
+	return result;
+}
+
+/* Fetches the next rx status */
+static unsigned int smsc911x_rx_get_rxstatus(struct smsc911x_data *pdata)
+{
+	unsigned int result =
+	    smsc911x_reg_read(pdata, RX_FIFO_INF) & RX_FIFO_INF_RXSUSED_;
+
+	if (result != 0)
+		result = smsc911x_reg_read(pdata, RX_STATUS_FIFO);
+
+	return result;
+}
+
+#ifdef USE_PHY_WORK_AROUND
+static int smsc911x_phy_check_loopbackpkt(struct smsc911x_data *pdata)
+{
+	unsigned int tries;
+	u32 wrsz;
+	u32 rdsz;
+	ulong bufp;
+
+	for (tries = 0; tries < 10; tries++) {
+		unsigned int txcmd_a;
+		unsigned int txcmd_b;
+		unsigned int status;
+		unsigned int pktlength;
+		unsigned int i;
+
+		/* Zero-out rx packet memory */
+		memset(pdata->loopback_rx_pkt, 0, MIN_PACKET_SIZE);
+
+		/* Write tx packet to 118 */
+		txcmd_a = (u32)((ulong)pdata->loopback_tx_pkt & 0x03) << 16;
+		txcmd_a |= TX_CMD_A_FIRST_SEG_ | TX_CMD_A_LAST_SEG_;
+		txcmd_a |= MIN_PACKET_SIZE;
+
+		txcmd_b = MIN_PACKET_SIZE << 16 | MIN_PACKET_SIZE;
+
+		smsc911x_reg_write(pdata, TX_DATA_FIFO, txcmd_a);
+		smsc911x_reg_write(pdata, TX_DATA_FIFO, txcmd_b);
+
+		bufp = (ulong)pdata->loopback_tx_pkt & (~0x3);
+		wrsz = MIN_PACKET_SIZE + 3;
+		wrsz += (u32)((ulong)pdata->loopback_tx_pkt & 0x3);
+		wrsz >>= 2;
+
+		smsc911x_tx_writefifo(pdata, (unsigned int *)bufp, wrsz);
+
+		/* Wait till transmit is done */
+		i = 60;
+		do {
+			udelay(5);
+			status = smsc911x_tx_get_txstatus(pdata);
+		} while ((i--) && (!status));
+
+		if (!status) {
+			SMSC_WARNING(HW, "Failed to transmit "
+				"during loopback test");
+			continue;
+		}
+		if (status & TX_STS_ES_) {
+			SMSC_WARNING(HW, "Transmit encountered "
+				"errors during loopback test");
+			continue;
+		}
+
+		/* Wait till receive is done */
+		i = 60;
+		do {
+			udelay(5);
+			status = smsc911x_rx_get_rxstatus(pdata);
+		} while ((i--) && (!status));
+
+		if (!status) {
+			SMSC_WARNING(HW,
+				"Failed to receive during loopback test");
+			continue;
+		}
+		if (status & RX_STS_ES_) {
+			SMSC_WARNING(HW, "Receive encountered "
+				"errors during loopback test");
+			continue;
+		}
+
+		pktlength = ((status & 0x3FFF0000UL) >> 16);
+		bufp = (ulong)pdata->loopback_rx_pkt;
+		rdsz = pktlength + 3;
+		rdsz += (u32)((ulong)pdata->loopback_rx_pkt & 0x3);
+		rdsz >>= 2;
+
+		smsc911x_rx_readfifo(pdata, (unsigned int *)bufp, rdsz);
+
+		if (pktlength != (MIN_PACKET_SIZE + 4)) {
+			SMSC_WARNING(HW, "Unexpected packet size "
+				"during loop back test, size=%d, will retry",
+				pktlength);
+		} else {
+			unsigned int j;
+			int mismatch = 0;
+			for (j = 0; j < MIN_PACKET_SIZE; j++) {
+				if (pdata->loopback_tx_pkt[j]
+				    != pdata->loopback_rx_pkt[j]) {
+					mismatch = 1;
+					break;
+				}
+			}
+			if (!mismatch) {
+				SMSC_TRACE(HW, "Successfully verified "
+					   "loopback packet");
+				return 0;
+			} else {
+				SMSC_WARNING(HW, "Data mismatch "
+					"during loop back test, will retry");
+			}
+		}
+	}
+
+	return -EIO;
+}
+
+static int smsc911x_phy_reset(struct smsc911x_data *pdata)
+{
+	struct phy_device *phy_dev = pdata->phy_dev;
+	unsigned int temp;
+	unsigned int i = 100000;
+
+	BUG_ON(!phy_dev);
+	BUG_ON(!phy_dev->bus);
+
+	SMSC_TRACE(HW, "Performing PHY BCR Reset");
+	smsc911x_mii_write(phy_dev->bus, phy_dev->addr, MII_BMCR, BMCR_RESET);
+	do {
+		msleep(1);
+		temp = smsc911x_mii_read(phy_dev->bus, phy_dev->addr,
+			MII_BMCR);
+	} while ((i--) && (temp & BMCR_RESET));
+
+	if (temp & BMCR_RESET) {
+		SMSC_WARNING(HW, "PHY reset failed to complete.");
+		return -EIO;
+	}
+	/* Extra delay required because the phy may not be completed with
+	* its reset when BMCR_RESET is cleared. Specs say 256 uS is
+	* enough delay but using 1ms here to be safe */
+	msleep(1);
+
+	return 0;
+}
+
+static int smsc911x_phy_loopbacktest(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	struct phy_device *phy_dev = pdata->phy_dev;
+	int result = -EIO;
+	unsigned int i, val;
+	unsigned long flags;
+
+	/* Initialise tx packet using broadcast destination address */
+	memset(pdata->loopback_tx_pkt, 0xff, ETH_ALEN);
+
+	/* Use incrementing source address */
+	for (i = 6; i < 12; i++)
+		pdata->loopback_tx_pkt[i] = (char)i;
+
+	/* Set length type field */
+	pdata->loopback_tx_pkt[12] = 0x00;
+	pdata->loopback_tx_pkt[13] = 0x00;
+
+	for (i = 14; i < MIN_PACKET_SIZE; i++)
+		pdata->loopback_tx_pkt[i] = (char)i;
+
+	val = smsc911x_reg_read(pdata, HW_CFG);
+	val &= HW_CFG_TX_FIF_SZ_;
+	val |= HW_CFG_SF_;
+	smsc911x_reg_write(pdata, HW_CFG, val);
+
+	smsc911x_reg_write(pdata, TX_CFG, TX_CFG_TX_ON_);
+	smsc911x_reg_write(pdata, RX_CFG,
+		(u32)((ulong)pdata->loopback_rx_pkt & 0x03) << 8);
+
+	for (i = 0; i < 10; i++) {
+		/* Set PHY to 10/FD, no ANEG, and loopback mode */
+		smsc911x_mii_write(phy_dev->bus, phy_dev->addr,	MII_BMCR,
+			BMCR_LOOPBACK | BMCR_FULLDPLX);
+
+		/* Enable MAC tx/rx, FD */
+		spin_lock_irqsave(&pdata->mac_lock, flags);
+		smsc911x_mac_write(pdata, MAC_CR, MAC_CR_FDPX_
+				   | MAC_CR_TXEN_ | MAC_CR_RXEN_);
+		spin_unlock_irqrestore(&pdata->mac_lock, flags);
+
+		if (smsc911x_phy_check_loopbackpkt(pdata) == 0) {
+			result = 0;
+			break;
+		}
+		pdata->resetcount++;
+
+		/* Disable MAC rx */
+		spin_lock_irqsave(&pdata->mac_lock, flags);
+		smsc911x_mac_write(pdata, MAC_CR, 0);
+		spin_unlock_irqrestore(&pdata->mac_lock, flags);
+
+		smsc911x_phy_reset(pdata);
+	}
+
+	/* Disable MAC */
+	spin_lock_irqsave(&pdata->mac_lock, flags);
+	smsc911x_mac_write(pdata, MAC_CR, 0);
+	spin_unlock_irqrestore(&pdata->mac_lock, flags);
+
+	/* Cancel PHY loopback mode */
+	smsc911x_mii_write(phy_dev->bus, phy_dev->addr, MII_BMCR, 0);
+
+	smsc911x_reg_write(pdata, TX_CFG, 0);
+	smsc911x_reg_write(pdata, RX_CFG, 0);
+
+	return result;
+}
+#endif				/* USE_PHY_WORK_AROUND */
+
+static u8 smsc95xx_resolve_flowctrl_fulldplx(u16 lcladv, u16 rmtadv)
+{
+	u8 cap = 0;
+
+	if (lcladv & ADVERTISE_PAUSE_CAP) {
+		if (lcladv & ADVERTISE_PAUSE_ASYM) {
+			if (rmtadv & LPA_PAUSE_CAP)
+				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
+			else if (rmtadv & LPA_PAUSE_ASYM)
+				cap = FLOW_CTRL_RX;
+		} else {
+			if (rmtadv & LPA_PAUSE_CAP)
+				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
+		}
+	} else if (lcladv & ADVERTISE_PAUSE_ASYM) {
+		if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM))
+			cap = FLOW_CTRL_TX;
+	}
+
+	return cap;
+}
+
+static void smsc911x_phy_update_flowcontrol(struct smsc911x_data *pdata)
+{
+	struct phy_device *phy_dev = pdata->phy_dev;
+	u32 afc = smsc911x_reg_read(pdata, AFC_CFG);
+	u32 flow;
+	unsigned long flags;
+
+	if (phy_dev->duplex == DUPLEX_FULL) {
+		u16 lcladv = phy_read(phy_dev, MII_ADVERTISE);
+		u16 rmtadv = phy_read(phy_dev, MII_LPA);
+		u8 cap = smsc95xx_resolve_flowctrl_fulldplx(lcladv, rmtadv);
+
+		if (cap & FLOW_CTRL_RX)
+			flow = 0xFFFF0002;
+		else
+			flow = 0;
+
+		if (cap & FLOW_CTRL_TX)
+			afc |= 0xF;
+		else
+			afc &= ~0xF;
+
+		SMSC_TRACE(HW, "rx pause %s, tx pause %s",
+			(cap & FLOW_CTRL_RX ? "enabled" : "disabled"),
+			(cap & FLOW_CTRL_TX ? "enabled" : "disabled"));
+	} else {
+		SMSC_TRACE(HW, "half duplex");
+		flow = 0;
+		afc |= 0xF;
+	}
+
+	spin_lock_irqsave(&pdata->mac_lock, flags);
+	smsc911x_mac_write(pdata, FLOW, flow);
+	spin_unlock_irqrestore(&pdata->mac_lock, flags);
+
+	smsc911x_reg_write(pdata, AFC_CFG, afc);
+}
+
+/* Update link mode if anything has changed.  Called periodically when the
+ * PHY is in polling mode, even if nothing has changed. */
+static void smsc911x_phy_adjust_link(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	struct phy_device *phy_dev = pdata->phy_dev;
+	unsigned long flags;
+	int carrier;
+
+	if (phy_dev->duplex != pdata->last_duplex) {
+		unsigned int mac_cr;
+		SMSC_TRACE(HW, "duplex state has changed");
+
+		spin_lock_irqsave(&pdata->mac_lock, flags);
+		mac_cr = smsc911x_mac_read(pdata, MAC_CR);
+		if (phy_dev->duplex) {
+			SMSC_TRACE(HW,
+				"configuring for full duplex mode");
+			mac_cr |= MAC_CR_FDPX_;
+		} else {
+			SMSC_TRACE(HW,
+				"configuring for half duplex mode");
+			mac_cr &= ~MAC_CR_FDPX_;
+		}
+		smsc911x_mac_write(pdata, MAC_CR, mac_cr);
+		spin_unlock_irqrestore(&pdata->mac_lock, flags);
+
+		smsc911x_phy_update_flowcontrol(pdata);
+		pdata->last_duplex = phy_dev->duplex;
+	}
+
+	carrier = netif_carrier_ok(dev);
+	if (carrier != pdata->last_carrier) {
+		SMSC_TRACE(HW, "carrier state has changed");
+		if (carrier) {
+			SMSC_TRACE(HW, "configuring for carrier OK");
+			if ((pdata->gpio_orig_setting & GPIO_CFG_LED1_EN_) &&
+			    (!pdata->using_extphy)) {
+				/* Restore orginal GPIO configuration */
+				pdata->gpio_setting = pdata->gpio_orig_setting;
+				smsc911x_reg_write(pdata, GPIO_CFG,
+					pdata->gpio_setting);
+			}
+		} else {
+			SMSC_TRACE(HW, "configuring for no carrier");
+			/* Check global setting that LED1
+			 * usage is 10/100 indicator */
+			pdata->gpio_setting = smsc911x_reg_read(pdata,
+				GPIO_CFG);
+			if ((pdata->gpio_setting & GPIO_CFG_LED1_EN_)
+			    && (!pdata->using_extphy)) {
+				/* Force 10/100 LED off, after saving
+				 * orginal GPIO configuration */
+				pdata->gpio_orig_setting = pdata->gpio_setting;
+
+				pdata->gpio_setting &= ~GPIO_CFG_LED1_EN_;
+				pdata->gpio_setting |= (GPIO_CFG_GPIOBUF0_
+							| GPIO_CFG_GPIODIR0_
+							| GPIO_CFG_GPIOD0_);
+				smsc911x_reg_write(pdata, GPIO_CFG,
+					pdata->gpio_setting);
+			}
+		}
+		pdata->last_carrier = carrier;
+	}
+}
+
+static int smsc911x_mii_probe(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	struct phy_device *phydev = NULL;
+	int phy_addr;
+
+	/* find the first phy */
+	for (phy_addr = 0; phy_addr < PHY_MAX_ADDR; phy_addr++) {
+		if (pdata->mii_bus->phy_map[phy_addr]) {
+			phydev = pdata->mii_bus->phy_map[phy_addr];
+			SMSC_TRACE(PROBE, "PHY %d: addr %d, phy_id 0x%08X",
+				phy_addr, phydev->addr, phydev->phy_id);
+			break;
+		}
+	}
+
+	if (!phydev) {
+		pr_err("%s: no PHY found\n", dev->name);
+		return -ENODEV;
+	}
+
+	phydev = phy_connect(dev, phydev->dev.bus_id,
+		&smsc911x_phy_adjust_link, 0, pdata->phy_interface);
+
+	if (IS_ERR(phydev)) {
+		pr_err("%s: Could not attach to PHY\n", dev->name);
+		return PTR_ERR(phydev);
+	}
+
+	pr_info("%s: attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
+		dev->name, phydev->drv->name, phydev->dev.bus_id, phydev->irq);
+
+	/* mask with MAC supported features */
+	phydev->supported &= (PHY_BASIC_FEATURES | SUPPORTED_Pause |
+			      SUPPORTED_Asym_Pause);
+	phydev->advertising = phydev->supported;
+
+	pdata->phy_dev = phydev;
+	pdata->last_duplex = -1;
+	pdata->last_carrier = -1;
+
+#ifdef USE_PHY_WORK_AROUND
+	if (smsc911x_phy_loopbacktest(dev) < 0) {
+		SMSC_WARNING(HW, "Failed Loop Back Test");
+		return -ENODEV;
+	}
+	SMSC_TRACE(HW, "Passed Loop Back Test");
+#endif				/* USE_PHY_WORK_AROUND */
+
+	SMSC_TRACE(HW, "phy initialised succesfully");
+	return 0;
+}
+
+static int __devinit smsc911x_mii_init(struct platform_device *pdev,
+				       struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	int err = -ENXIO, i;
+
+	pdata->mii_bus = mdiobus_alloc();
+	if (!pdata->mii_bus) {
+		err = -ENOMEM;
+		goto err_out_1;
+	}
+
+	pdata->mii_bus->name = SMSC_MDIONAME;
+	snprintf(pdata->mii_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id);
+	pdata->mii_bus->priv = pdata;
+	pdata->mii_bus->read = smsc911x_mii_read;
+	pdata->mii_bus->write = smsc911x_mii_write;
+	pdata->mii_bus->irq = pdata->phy_irq;
+	for (i = 0; i < PHY_MAX_ADDR; ++i)
+		pdata->mii_bus->irq[i] = PHY_POLL;
+
+	pdata->mii_bus->parent = &pdev->dev;
+	dev_set_drvdata(&pdev->dev, &pdata->mii_bus);
+
+	pdata->using_extphy = 0;
+
+	switch (pdata->idrev & 0xFFFF0000) {
+	case 0x01170000:
+	case 0x01150000:
+	case 0x117A0000:
+	case 0x115A0000:
+		/* External PHY supported, try to autodetect */
+		if (smsc911x_phy_initialise_external(pdata) < 0) {
+			SMSC_TRACE(HW, "No external PHY detected, "
+				"using internal PHY");
+		}
+		break;
+	default:
+		SMSC_TRACE(HW, "External PHY is not supported, "
+			"using internal PHY");
+		break;
+	}
+
+	if (!pdata->using_extphy) {
+		/* Mask all PHYs except ID 1 (internal) */
+		pdata->mii_bus->phy_mask = ~(1 << 1);
+	}
+
+	if (mdiobus_register(pdata->mii_bus)) {
+		SMSC_WARNING(PROBE, "Error registering mii bus");
+		goto err_out_free_bus_2;
+	}
+
+	if (smsc911x_mii_probe(dev) < 0) {
+		SMSC_WARNING(PROBE, "Error registering mii bus");
+		goto err_out_unregister_bus_3;
+	}
+
+	return 0;
+
+err_out_unregister_bus_3:
+	mdiobus_unregister(pdata->mii_bus);
+err_out_free_bus_2:
+	mdiobus_free(pdata->mii_bus);
+err_out_1:
+	return err;
+}
+
+/* Gets the number of tx statuses in the fifo */
+static unsigned int smsc911x_tx_get_txstatcount(struct smsc911x_data *pdata)
+{
+	return (smsc911x_reg_read(pdata, TX_FIFO_INF)
+		& TX_FIFO_INF_TSUSED_) >> 16;
+}
+
+/* Reads tx statuses and increments counters where necessary */
+static void smsc911x_tx_update_txcounters(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	unsigned int tx_stat;
+
+	while ((tx_stat = smsc911x_tx_get_txstatus(pdata)) != 0) {
+		if (unlikely(tx_stat & 0x80000000)) {
+			/* In this driver the packet tag is used as the packet
+			 * length. Since a packet length can never reach the
+			 * size of 0x8000, this bit is reserved. It is worth
+			 * noting that the "reserved bit" in the warning above
+			 * does not reference a hardware defined reserved bit
+			 * but rather a driver defined one.
+			 */
+			SMSC_WARNING(HW,
+				"Packet tag reserved bit is high");
+		} else {
+			if (unlikely(tx_stat & 0x00008000)) {
+				dev->stats.tx_errors++;
+			} else {
+				dev->stats.tx_packets++;
+				dev->stats.tx_bytes += (tx_stat >> 16);
+			}
+			if (unlikely(tx_stat & 0x00000100)) {
+				dev->stats.collisions += 16;
+				dev->stats.tx_aborted_errors += 1;
+			} else {
+				dev->stats.collisions +=
+				    ((tx_stat >> 3) & 0xF);
+			}
+			if (unlikely(tx_stat & 0x00000800))
+				dev->stats.tx_carrier_errors += 1;
+			if (unlikely(tx_stat & 0x00000200)) {
+				dev->stats.collisions++;
+				dev->stats.tx_aborted_errors++;
+			}
+		}
+	}
+}
+
+/* Increments the Rx error counters */
+static void
+smsc911x_rx_counterrors(struct net_device *dev, unsigned int rxstat)
+{
+	int crc_err = 0;
+
+	if (unlikely(rxstat & 0x00008000)) {
+		dev->stats.rx_errors++;
+		if (unlikely(rxstat & 0x00000002)) {
+			dev->stats.rx_crc_errors++;
+			crc_err = 1;
+		}
+	}
+	if (likely(!crc_err)) {
+		if (unlikely((rxstat & 0x00001020) == 0x00001020)) {
+			/* Frame type indicates length,
+			 * and length error is set */
+			dev->stats.rx_length_errors++;
+		}
+		if (rxstat & RX_STS_MCAST_)
+			dev->stats.multicast++;
+	}
+}
+
+/* Quickly dumps bad packets */
+static void
+smsc911x_rx_fastforward(struct smsc911x_data *pdata, unsigned int pktbytes)
+{
+	unsigned int pktwords = (pktbytes + NET_IP_ALIGN + 3) >> 2;
+
+	if (likely(pktwords >= 4)) {
+		unsigned int timeout = 500;
+		unsigned int val;
+		smsc911x_reg_write(pdata, RX_DP_CTRL, RX_DP_CTRL_RX_FFWD_);
+		do {
+			udelay(1);
+			val = smsc911x_reg_read(pdata, RX_DP_CTRL);
+		} while (timeout-- && (val & RX_DP_CTRL_RX_FFWD_));
+
+		if (unlikely(timeout == 0))
+			SMSC_WARNING(HW, "Timed out waiting for "
+				"RX FFWD to finish, RX_DP_CTRL: 0x%08X", val);
+	} else {
+		unsigned int temp;
+		while (pktwords--)
+			temp = smsc911x_reg_read(pdata, RX_DATA_FIFO);
+	}
+}
+
+/* NAPI poll function */
+static int smsc911x_poll(struct napi_struct *napi, int budget)
+{
+	struct smsc911x_data *pdata =
+		container_of(napi, struct smsc911x_data, napi);
+	struct net_device *dev = pdata->dev;
+	int npackets = 0;
+
+	while (likely(netif_running(dev)) && (npackets < budget)) {
+		unsigned int pktlength;
+		unsigned int pktwords;
+		struct sk_buff *skb;
+		unsigned int rxstat = smsc911x_rx_get_rxstatus(pdata);
+
+		if (!rxstat) {
+			unsigned int temp;
+			/* We processed all packets available.  Tell NAPI it can
+			 * stop polling then re-enable rx interrupts */
+			smsc911x_reg_write(pdata, INT_STS, INT_STS_RSFL_);
+			netif_rx_complete(dev, napi);
+			temp = smsc911x_reg_read(pdata, INT_EN);
+			temp |= INT_EN_RSFL_EN_;
+			smsc911x_reg_write(pdata, INT_EN, temp);
+			break;
+		}
+
+		/* Count packet for NAPI scheduling, even if it has an error.
+		 * Error packets still require cycles to discard */
+		npackets++;
+
+		pktlength = ((rxstat & 0x3FFF0000) >> 16);
+		pktwords = (pktlength + NET_IP_ALIGN + 3) >> 2;
+		smsc911x_rx_counterrors(dev, rxstat);
+
+		if (unlikely(rxstat & RX_STS_ES_)) {
+			SMSC_WARNING(RX_ERR,
+				"Discarding packet with error bit set");
+			/* Packet has an error, discard it and continue with
+			 * the next */
+			smsc911x_rx_fastforward(pdata, pktwords);
+			dev->stats.rx_dropped++;
+			continue;
+		}
+
+		skb = netdev_alloc_skb(dev, pktlength + NET_IP_ALIGN);
+		if (unlikely(!skb)) {
+			SMSC_WARNING(RX_ERR,
+				"Unable to allocate skb for rx packet");
+			/* Drop the packet and stop this polling iteration */
+			smsc911x_rx_fastforward(pdata, pktwords);
+			dev->stats.rx_dropped++;
+			break;
+		}
+
+		skb->data = skb->head;
+		skb_reset_tail_pointer(skb);
+
+		/* Align IP on 16B boundary */
+		skb_reserve(skb, NET_IP_ALIGN);
+		skb_put(skb, pktlength - 4);
+		smsc911x_rx_readfifo(pdata, (unsigned int *)skb->head,
+				     pktwords);
+		skb->protocol = eth_type_trans(skb, dev);
+		skb->ip_summed = CHECKSUM_NONE;
+		netif_receive_skb(skb);
+
+		/* Update counters */
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += (pktlength - 4);
+		dev->last_rx = jiffies;
+	}
+
+	/* Return total received packets */
+	return npackets;
+}
+
+/* Returns hash bit number for given MAC address
+ * Example:
+ * 01 00 5E 00 00 01 -> returns bit number 31 */
+static unsigned int smsc911x_hash(char addr[ETH_ALEN])
+{
+	return (ether_crc(ETH_ALEN, addr) >> 26) & 0x3f;
+}
+
+static void smsc911x_rx_multicast_update(struct smsc911x_data *pdata)
+{
+	/* Performs the multicast & mac_cr update.  This is called when
+	 * safe on the current hardware, and with the mac_lock held */
+	unsigned int mac_cr;
+
+	SMSC_ASSERT_MAC_LOCK(pdata);
+
+	mac_cr = smsc911x_mac_read(pdata, MAC_CR);
+	mac_cr |= pdata->set_bits_mask;
+	mac_cr &= ~(pdata->clear_bits_mask);
+	smsc911x_mac_write(pdata, MAC_CR, mac_cr);
+	smsc911x_mac_write(pdata, HASHH, pdata->hashhi);
+	smsc911x_mac_write(pdata, HASHL, pdata->hashlo);
+	SMSC_TRACE(HW, "maccr 0x%08X, HASHH 0x%08X, HASHL 0x%08X",
+		mac_cr, pdata->hashhi, pdata->hashlo);
+}
+
+static void smsc911x_rx_multicast_update_workaround(struct smsc911x_data *pdata)
+{
+	unsigned int mac_cr;
+
+	/* This function is only called for older LAN911x devices
+	 * (revA or revB), where MAC_CR, HASHH and HASHL should not
+	 * be modified during Rx - newer devices immediately update the
+	 * registers.
+	 *
+	 * This is called from interrupt context */
+
+	spin_lock(&pdata->mac_lock);
+
+	/* Check Rx has stopped */
+	if (smsc911x_mac_read(pdata, MAC_CR) & MAC_CR_RXEN_)
+		SMSC_WARNING(DRV, "Rx not stopped");
+
+	/* Perform the update - safe to do now Rx has stopped */
+	smsc911x_rx_multicast_update(pdata);
+
+	/* Re-enable Rx */
+	mac_cr = smsc911x_mac_read(pdata, MAC_CR);
+	mac_cr |= MAC_CR_RXEN_;
+	smsc911x_mac_write(pdata, MAC_CR, mac_cr);
+
+	pdata->multicast_update_pending = 0;
+
+	spin_unlock(&pdata->mac_lock);
+}
+
+static int smsc911x_soft_reset(struct smsc911x_data *pdata)
+{
+	unsigned int timeout;
+	unsigned int temp;
+
+	/* Reset the LAN911x */
+	smsc911x_reg_write(pdata, HW_CFG, HW_CFG_SRST_);
+	timeout = 10;
+	do {
+		udelay(10);
+		temp = smsc911x_reg_read(pdata, HW_CFG);
+	} while ((--timeout) && (temp & HW_CFG_SRST_));
+
+	if (unlikely(temp & HW_CFG_SRST_)) {
+		SMSC_WARNING(DRV, "Failed to complete reset");
+		return -EIO;
+	}
+	return 0;
+}
+
+/* Sets the device MAC address to dev_addr, called with mac_lock held */
+static void
+smsc911x_set_mac_address(struct smsc911x_data *pdata, u8 dev_addr[6])
+{
+	u32 mac_high16 = (dev_addr[5] << 8) | dev_addr[4];
+	u32 mac_low32 = (dev_addr[3] << 24) | (dev_addr[2] << 16) |
+	    (dev_addr[1] << 8) | dev_addr[0];
+
+	SMSC_ASSERT_MAC_LOCK(pdata);
+
+	smsc911x_mac_write(pdata, ADDRH, mac_high16);
+	smsc911x_mac_write(pdata, ADDRL, mac_low32);
+}
+
+static int smsc911x_open(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	unsigned int timeout;
+	unsigned int temp;
+	unsigned int intcfg;
+
+	/* if the phy is not yet registered, retry later*/
+	if (!pdata->phy_dev) {
+		SMSC_WARNING(HW, "phy_dev is NULL");
+		return -EAGAIN;
+	}
+
+	if (!is_valid_ether_addr(dev->dev_addr)) {
+		SMSC_WARNING(HW, "dev_addr is not a valid MAC address");
+		return -EADDRNOTAVAIL;
+	}
+
+	/* Reset the LAN911x */
+	if (smsc911x_soft_reset(pdata)) {
+		SMSC_WARNING(HW, "soft reset failed");
+		return -EIO;
+	}
+
+	smsc911x_reg_write(pdata, HW_CFG, 0x00050000);
+	smsc911x_reg_write(pdata, AFC_CFG, 0x006E3740);
+
+	/* Make sure EEPROM has finished loading before setting GPIO_CFG */
+	timeout = 50;
+	while ((timeout--) &&
+	       (smsc911x_reg_read(pdata, E2P_CMD) & E2P_CMD_EPC_BUSY_)) {
+		udelay(10);
+	}
+
+	if (unlikely(timeout == 0))
+		SMSC_WARNING(IFUP,
+			"Timed out waiting for EEPROM busy bit to clear");
+
+	smsc911x_reg_write(pdata, GPIO_CFG, 0x70070000);
+
+	/* The soft reset above cleared the device's MAC address,
+	 * restore it from local copy (set in probe) */
+	spin_lock_irq(&pdata->mac_lock);
+	smsc911x_set_mac_address(pdata, dev->dev_addr);
+	spin_unlock_irq(&pdata->mac_lock);
+
+	/* Initialise irqs, but leave all sources disabled */
+	smsc911x_reg_write(pdata, INT_EN, 0);
+	smsc911x_reg_write(pdata, INT_STS, 0xFFFFFFFF);
+
+	/* Set interrupt deassertion to 100uS */
+	intcfg = ((10 << 24) | INT_CFG_IRQ_EN_);
+
+	if (pdata->irq_polarity) {
+		SMSC_TRACE(IFUP, "irq polarity: active high");
+		intcfg |= INT_CFG_IRQ_POL_;
+	} else {
+		SMSC_TRACE(IFUP, "irq polarity: active low");
+	}
+
+	if (pdata->irq_type) {
+		SMSC_TRACE(IFUP, "irq type: push-pull");
+		intcfg |= INT_CFG_IRQ_TYPE_;
+	} else {
+		SMSC_TRACE(IFUP, "irq type: open drain");
+	}
+
+	smsc911x_reg_write(pdata, INT_CFG, intcfg);
+
+	SMSC_TRACE(IFUP, "Testing irq handler using IRQ %d", dev->irq);
+	pdata->software_irq_signal = 0;
+	smp_wmb();
+
+	temp = smsc911x_reg_read(pdata, INT_EN);
+	temp |= INT_EN_SW_INT_EN_;
+	smsc911x_reg_write(pdata, INT_EN, temp);
+
+	timeout = 1000;
+	while (timeout--) {
+		if (pdata->software_irq_signal)
+			break;
+		msleep(1);
+	}
+
+	if (!pdata->software_irq_signal) {
+		dev_warn(&dev->dev, "ISR failed signaling test (IRQ %d)\n",
+			 dev->irq);
+		return -ENODEV;
+	}
+	SMSC_TRACE(IFUP, "IRQ handler passed test using IRQ %d", dev->irq);
+
+	dev_info(&dev->dev, "SMSC911x/921x identified at %#08lx, IRQ: %d\n",
+		 (unsigned long)pdata->ioaddr, dev->irq);
+
+	/* Bring the PHY up */
+	phy_start(pdata->phy_dev);
+
+	temp = smsc911x_reg_read(pdata, HW_CFG);
+	/* Preserve TX FIFO size and external PHY configuration */
+	temp &= (HW_CFG_TX_FIF_SZ_|0x00000FFF);
+	temp |= HW_CFG_SF_;
+	smsc911x_reg_write(pdata, HW_CFG, temp);
+
+	temp = smsc911x_reg_read(pdata, FIFO_INT);
+	temp |= FIFO_INT_TX_AVAIL_LEVEL_;
+	temp &= ~(FIFO_INT_RX_STS_LEVEL_);
+	smsc911x_reg_write(pdata, FIFO_INT, temp);
+
+	/* set RX Data offset to 2 bytes for alignment */
+	smsc911x_reg_write(pdata, RX_CFG, (2 << 8));
+
+	/* enable NAPI polling before enabling RX interrupts */
+	napi_enable(&pdata->napi);
+
+	temp = smsc911x_reg_read(pdata, INT_EN);
+	temp |= (INT_EN_TDFA_EN_ | INT_EN_RSFL_EN_);
+	smsc911x_reg_write(pdata, INT_EN, temp);
+
+	spin_lock_irq(&pdata->mac_lock);
+	temp = smsc911x_mac_read(pdata, MAC_CR);
+	temp |= (MAC_CR_TXEN_ | MAC_CR_RXEN_ | MAC_CR_HBDIS_);
+	smsc911x_mac_write(pdata, MAC_CR, temp);
+	spin_unlock_irq(&pdata->mac_lock);
+
+	smsc911x_reg_write(pdata, TX_CFG, TX_CFG_TX_ON_);
+
+	netif_start_queue(dev);
+	return 0;
+}
+
+/* Entry point for stopping the interface */
+static int smsc911x_stop(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	unsigned int temp;
+
+	BUG_ON(!pdata->phy_dev);
+
+	/* Disable all device interrupts */
+	temp = smsc911x_reg_read(pdata, INT_CFG);
+	temp &= ~INT_CFG_IRQ_EN_;
+	smsc911x_reg_write(pdata, INT_CFG, temp);
+
+	/* Stop Tx and Rx polling */
+	netif_stop_queue(dev);
+	napi_disable(&pdata->napi);
+
+	/* At this point all Rx and Tx activity is stopped */
+	dev->stats.rx_dropped += smsc911x_reg_read(pdata, RX_DROP);
+	smsc911x_tx_update_txcounters(dev);
+
+	/* Bring the PHY down */
+	phy_stop(pdata->phy_dev);
+
+	SMSC_TRACE(IFDOWN, "Interface stopped");
+	return 0;
+}
+
+/* Entry point for transmitting a packet */
+static int smsc911x_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	unsigned int freespace;
+	unsigned int tx_cmd_a;
+	unsigned int tx_cmd_b;
+	unsigned int temp;
+	u32 wrsz;
+	ulong bufp;
+
+	freespace = smsc911x_reg_read(pdata, TX_FIFO_INF) & TX_FIFO_INF_TDFREE_;
+
+	if (unlikely(freespace < TX_FIFO_LOW_THRESHOLD))
+		SMSC_WARNING(TX_ERR,
+			"Tx data fifo low, space available: %d", freespace);
+
+	/* Word alignment adjustment */
+	tx_cmd_a = (u32)((ulong)skb->data & 0x03) << 16;
+	tx_cmd_a |= TX_CMD_A_FIRST_SEG_ | TX_CMD_A_LAST_SEG_;
+	tx_cmd_a |= (unsigned int)skb->len;
+
+	tx_cmd_b = ((unsigned int)skb->len) << 16;
+	tx_cmd_b |= (unsigned int)skb->len;
+
+	smsc911x_reg_write(pdata, TX_DATA_FIFO, tx_cmd_a);
+	smsc911x_reg_write(pdata, TX_DATA_FIFO, tx_cmd_b);
+
+	bufp = (ulong)skb->data & (~0x3);
+	wrsz = (u32)skb->len + 3;
+	wrsz += (u32)((ulong)skb->data & 0x3);
+	wrsz >>= 2;
+
+	smsc911x_tx_writefifo(pdata, (unsigned int *)bufp, wrsz);
+	freespace -= (skb->len + 32);
+	dev_kfree_skb(skb);
+	dev->trans_start = jiffies;
+
+	if (unlikely(smsc911x_tx_get_txstatcount(pdata) >= 30))
+		smsc911x_tx_update_txcounters(dev);
+
+	if (freespace < TX_FIFO_LOW_THRESHOLD) {
+		netif_stop_queue(dev);
+		temp = smsc911x_reg_read(pdata, FIFO_INT);
+		temp &= 0x00FFFFFF;
+		temp |= 0x32000000;
+		smsc911x_reg_write(pdata, FIFO_INT, temp);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+/* Entry point for getting status counters */
+static struct net_device_stats *smsc911x_get_stats(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	smsc911x_tx_update_txcounters(dev);
+	dev->stats.rx_dropped += smsc911x_reg_read(pdata, RX_DROP);
+	return &dev->stats;
+}
+
+/* Entry point for setting addressing modes */
+static void smsc911x_set_multicast_list(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	unsigned long flags;
+
+	if (dev->flags & IFF_PROMISC) {
+		/* Enabling promiscuous mode */
+		pdata->set_bits_mask = MAC_CR_PRMS_;
+		pdata->clear_bits_mask = (MAC_CR_MCPAS_ | MAC_CR_HPFILT_);
+		pdata->hashhi = 0;
+		pdata->hashlo = 0;
+	} else if (dev->flags & IFF_ALLMULTI) {
+		/* Enabling all multicast mode */
+		pdata->set_bits_mask = MAC_CR_MCPAS_;
+		pdata->clear_bits_mask = (MAC_CR_PRMS_ | MAC_CR_HPFILT_);
+		pdata->hashhi = 0;
+		pdata->hashlo = 0;
+	} else if (dev->mc_count > 0) {
+		/* Enabling specific multicast addresses */
+		unsigned int hash_high = 0;
+		unsigned int hash_low = 0;
+		unsigned int count = 0;
+		struct dev_mc_list *mc_list = dev->mc_list;
+
+		pdata->set_bits_mask = MAC_CR_HPFILT_;
+		pdata->clear_bits_mask = (MAC_CR_PRMS_ | MAC_CR_MCPAS_);
+
+		while (mc_list) {
+			count++;
+			if ((mc_list->dmi_addrlen) == ETH_ALEN) {
+				unsigned int bitnum =
+				    smsc911x_hash(mc_list->dmi_addr);
+				unsigned int mask = 0x01 << (bitnum & 0x1F);
+				if (bitnum & 0x20)
+					hash_high |= mask;
+				else
+					hash_low |= mask;
+			} else {
+				SMSC_WARNING(DRV, "dmi_addrlen != 6");
+			}
+			mc_list = mc_list->next;
+		}
+		if (count != (unsigned int)dev->mc_count)
+			SMSC_WARNING(DRV, "mc_count != dev->mc_count");
+
+		pdata->hashhi = hash_high;
+		pdata->hashlo = hash_low;
+	} else {
+		/* Enabling local MAC address only */
+		pdata->set_bits_mask = 0;
+		pdata->clear_bits_mask =
+		    (MAC_CR_PRMS_ | MAC_CR_MCPAS_ | MAC_CR_HPFILT_);
+		pdata->hashhi = 0;
+		pdata->hashlo = 0;
+	}
+
+	spin_lock_irqsave(&pdata->mac_lock, flags);
+
+	if (pdata->generation <= 1) {
+		/* Older hardware revision - cannot change these flags while
+		 * receiving data */
+		if (!pdata->multicast_update_pending) {
+			unsigned int temp;
+			SMSC_TRACE(HW, "scheduling mcast update");
+			pdata->multicast_update_pending = 1;
+
+			/* Request the hardware to stop, then perform the
+			 * update when we get an RX_STOP interrupt */
+			smsc911x_reg_write(pdata, INT_STS, INT_STS_RXSTOP_INT_);
+			temp = smsc911x_reg_read(pdata, INT_EN);
+			temp |= INT_EN_RXSTOP_INT_EN_;
+			smsc911x_reg_write(pdata, INT_EN, temp);
+
+			temp = smsc911x_mac_read(pdata, MAC_CR);
+			temp &= ~(MAC_CR_RXEN_);
+			smsc911x_mac_write(pdata, MAC_CR, temp);
+		} else {
+			/* There is another update pending, this should now
+			 * use the newer values */
+		}
+	} else {
+		/* Newer hardware revision - can write immediately */
+		smsc911x_rx_multicast_update(pdata);
+	}
+
+	spin_unlock_irqrestore(&pdata->mac_lock, flags);
+}
+
+static irqreturn_t smsc911x_irqhandler(int irq, void *dev_id)
+{
+	struct net_device *dev = dev_id;
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	u32 intsts = smsc911x_reg_read(pdata, INT_STS);
+	u32 inten = smsc911x_reg_read(pdata, INT_EN);
+	int serviced = IRQ_NONE;
+	u32 temp;
+
+	if (unlikely(intsts & inten & INT_STS_SW_INT_)) {
+		temp = smsc911x_reg_read(pdata, INT_EN);
+		temp &= (~INT_EN_SW_INT_EN_);
+		smsc911x_reg_write(pdata, INT_EN, temp);
+		smsc911x_reg_write(pdata, INT_STS, INT_STS_SW_INT_);
+		pdata->software_irq_signal = 1;
+		smp_wmb();
+		serviced = IRQ_HANDLED;
+	}
+
+	if (unlikely(intsts & inten & INT_STS_RXSTOP_INT_)) {
+		/* Called when there is a multicast update scheduled and
+		 * it is now safe to complete the update */
+		SMSC_TRACE(INTR, "RX Stop interrupt");
+		temp = smsc911x_reg_read(pdata, INT_EN);
+		temp &= (~INT_EN_RXSTOP_INT_EN_);
+		smsc911x_reg_write(pdata, INT_EN, temp);
+		smsc911x_reg_write(pdata, INT_STS, INT_STS_RXSTOP_INT_);
+		smsc911x_rx_multicast_update_workaround(pdata);
+		serviced = IRQ_HANDLED;
+	}
+
+	if (intsts & inten & INT_STS_TDFA_) {
+		temp = smsc911x_reg_read(pdata, FIFO_INT);
+		temp |= FIFO_INT_TX_AVAIL_LEVEL_;
+		smsc911x_reg_write(pdata, FIFO_INT, temp);
+		smsc911x_reg_write(pdata, INT_STS, INT_STS_TDFA_);
+		netif_wake_queue(dev);
+		serviced = IRQ_HANDLED;
+	}
+
+	if (unlikely(intsts & inten & INT_STS_RXE_)) {
+		SMSC_TRACE(INTR, "RX Error interrupt");
+		smsc911x_reg_write(pdata, INT_STS, INT_STS_RXE_);
+		serviced = IRQ_HANDLED;
+	}
+
+	if (likely(intsts & inten & INT_STS_RSFL_)) {
+		if (likely(netif_rx_schedule_prep(dev, &pdata->napi))) {
+			/* Disable Rx interrupts */
+			temp = smsc911x_reg_read(pdata, INT_EN);
+			temp &= (~INT_EN_RSFL_EN_);
+			smsc911x_reg_write(pdata, INT_EN, temp);
+			/* Schedule a NAPI poll */
+			__netif_rx_schedule(dev, &pdata->napi);
+		} else {
+			SMSC_WARNING(RX_ERR,
+				"netif_rx_schedule_prep failed");
+		}
+		serviced = IRQ_HANDLED;
+	}
+
+	return serviced;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+void smsc911x_poll_controller(struct net_device *dev)
+{
+	disable_irq(dev->irq);
+	smsc911x_irqhandler(0, dev);
+	enable_irq(dev->irq);
+}
+#endif				/* CONFIG_NET_POLL_CONTROLLER */
+
+/* Standard ioctls for mii-tool */
+static int smsc911x_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+
+	if (!netif_running(dev) || !pdata->phy_dev)
+		return -EINVAL;
+
+	return phy_mii_ioctl(pdata->phy_dev, if_mii(ifr), cmd);
+}
+
+static int
+smsc911x_ethtool_getsettings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+
+	cmd->maxtxpkt = 1;
+	cmd->maxrxpkt = 1;
+	return phy_ethtool_gset(pdata->phy_dev, cmd);
+}
+
+static int
+smsc911x_ethtool_setsettings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+
+	return phy_ethtool_sset(pdata->phy_dev, cmd);
+}
+
+static void smsc911x_ethtool_getdrvinfo(struct net_device *dev,
+					struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, SMSC_CHIPNAME, sizeof(info->driver));
+	strlcpy(info->version, SMSC_DRV_VERSION, sizeof(info->version));
+	strlcpy(info->bus_info, dev->dev.parent->bus_id,
+		sizeof(info->bus_info));
+}
+
+static int smsc911x_ethtool_nwayreset(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+
+	return phy_start_aneg(pdata->phy_dev);
+}
+
+static u32 smsc911x_ethtool_getmsglevel(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	return pdata->msg_enable;
+}
+
+static void smsc911x_ethtool_setmsglevel(struct net_device *dev, u32 level)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	pdata->msg_enable = level;
+}
+
+static int smsc911x_ethtool_getregslen(struct net_device *dev)
+{
+	return (((E2P_DATA - ID_REV) / 4 + 1) + (WUCSR - MAC_CR) + 1 + 32) *
+	    sizeof(u32);
+}
+
+static void
+smsc911x_ethtool_getregs(struct net_device *dev, struct ethtool_regs *regs,
+			 void *buf)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	struct phy_device *phy_dev = pdata->phy_dev;
+	unsigned long flags;
+	unsigned int i;
+	unsigned int j = 0;
+	u32 *data = buf;
+
+	regs->version = pdata->idrev;
+	for (i = ID_REV; i <= E2P_DATA; i += (sizeof(u32)))
+		data[j++] = smsc911x_reg_read(pdata, i);
+
+	for (i = MAC_CR; i <= WUCSR; i++) {
+		spin_lock_irqsave(&pdata->mac_lock, flags);
+		data[j++] = smsc911x_mac_read(pdata, i);
+		spin_unlock_irqrestore(&pdata->mac_lock, flags);
+	}
+
+	for (i = 0; i <= 31; i++)
+		data[j++] = smsc911x_mii_read(phy_dev->bus, phy_dev->addr, i);
+}
+
+static void smsc911x_eeprom_enable_access(struct smsc911x_data *pdata)
+{
+	unsigned int temp = smsc911x_reg_read(pdata, GPIO_CFG);
+	temp &= ~GPIO_CFG_EEPR_EN_;
+	smsc911x_reg_write(pdata, GPIO_CFG, temp);
+	msleep(1);
+}
+
+static int smsc911x_eeprom_send_cmd(struct smsc911x_data *pdata, u32 op)
+{
+	int timeout = 100;
+	u32 e2cmd;
+
+	SMSC_TRACE(DRV, "op 0x%08x", op);
+	if (smsc911x_reg_read(pdata, E2P_CMD) & E2P_CMD_EPC_BUSY_) {
+		SMSC_WARNING(DRV, "Busy at start");
+		return -EBUSY;
+	}
+
+	e2cmd = op | E2P_CMD_EPC_BUSY_;
+	smsc911x_reg_write(pdata, E2P_CMD, e2cmd);
+
+	do {
+		msleep(1);
+		e2cmd = smsc911x_reg_read(pdata, E2P_CMD);
+	} while ((e2cmd & E2P_CMD_EPC_BUSY_) && (timeout--));
+
+	if (!timeout) {
+		SMSC_TRACE(DRV, "TIMED OUT");
+		return -EAGAIN;
+	}
+
+	if (e2cmd & E2P_CMD_EPC_TIMEOUT_) {
+		SMSC_TRACE(DRV, "Error occured during eeprom operation");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int smsc911x_eeprom_read_location(struct smsc911x_data *pdata,
+					 u8 address, u8 *data)
+{
+	u32 op = E2P_CMD_EPC_CMD_READ_ | address;
+	int ret;
+
+	SMSC_TRACE(DRV, "address 0x%x", address);
+	ret = smsc911x_eeprom_send_cmd(pdata, op);
+
+	if (!ret)
+		data[address] = smsc911x_reg_read(pdata, E2P_DATA);
+
+	return ret;
+}
+
+static int smsc911x_eeprom_write_location(struct smsc911x_data *pdata,
+					  u8 address, u8 data)
+{
+	u32 op = E2P_CMD_EPC_CMD_ERASE_ | address;
+	int ret;
+
+	SMSC_TRACE(DRV, "address 0x%x, data 0x%x", address, data);
+	ret = smsc911x_eeprom_send_cmd(pdata, op);
+
+	if (!ret) {
+		op = E2P_CMD_EPC_CMD_WRITE_ | address;
+		smsc911x_reg_write(pdata, E2P_DATA, (u32)data);
+		ret = smsc911x_eeprom_send_cmd(pdata, op);
+	}
+
+	return ret;
+}
+
+static int smsc911x_ethtool_get_eeprom_len(struct net_device *dev)
+{
+	return SMSC911X_EEPROM_SIZE;
+}
+
+static int smsc911x_ethtool_get_eeprom(struct net_device *dev,
+				       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	u8 eeprom_data[SMSC911X_EEPROM_SIZE];
+	int len;
+	int i;
+
+	smsc911x_eeprom_enable_access(pdata);
+
+	len = min(eeprom->len, SMSC911X_EEPROM_SIZE);
+	for (i = 0; i < len; i++) {
+		int ret = smsc911x_eeprom_read_location(pdata, i, eeprom_data);
+		if (ret < 0) {
+			eeprom->len = 0;
+			return ret;
+		}
+	}
+
+	memcpy(data, &eeprom_data[eeprom->offset], len);
+	eeprom->len = len;
+	return 0;
+}
+
+static int smsc911x_ethtool_set_eeprom(struct net_device *dev,
+				       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	int ret;
+	struct smsc911x_data *pdata = netdev_priv(dev);
+
+	smsc911x_eeprom_enable_access(pdata);
+	smsc911x_eeprom_send_cmd(pdata, E2P_CMD_EPC_CMD_EWEN_);
+	ret = smsc911x_eeprom_write_location(pdata, eeprom->offset, *data);
+	smsc911x_eeprom_send_cmd(pdata, E2P_CMD_EPC_CMD_EWDS_);
+
+	/* Single byte write, according to man page */
+	eeprom->len = 1;
+
+	return ret;
+}
+
+static struct ethtool_ops smsc911x_ethtool_ops = {
+	.get_settings = smsc911x_ethtool_getsettings,
+	.set_settings = smsc911x_ethtool_setsettings,
+	.get_link = ethtool_op_get_link,
+	.get_drvinfo = smsc911x_ethtool_getdrvinfo,
+	.nway_reset = smsc911x_ethtool_nwayreset,
+	.get_msglevel = smsc911x_ethtool_getmsglevel,
+	.set_msglevel = smsc911x_ethtool_setmsglevel,
+	.get_regs_len = smsc911x_ethtool_getregslen,
+	.get_regs = smsc911x_ethtool_getregs,
+	.get_eeprom_len = smsc911x_ethtool_get_eeprom_len,
+	.get_eeprom = smsc911x_ethtool_get_eeprom,
+	.set_eeprom = smsc911x_ethtool_set_eeprom,
+};
+
+/* Initializing private device structures, only called from probe */
+static int __devinit smsc911x_init(struct net_device *dev)
+{
+	struct smsc911x_data *pdata = netdev_priv(dev);
+	unsigned int byte_test;
+
+	SMSC_TRACE(PROBE, "Driver Parameters:");
+	SMSC_TRACE(PROBE, "LAN base: 0x%08lX",
+		(unsigned long)pdata->ioaddr);
+	SMSC_TRACE(PROBE, "IRQ: %d", dev->irq);
+	SMSC_TRACE(PROBE, "PHY will be autodetected.");
+
+#if (!SMSC_CAN_USE_32BIT)
+	spin_lock_init(&pdata->dev_lock);
+#endif
+
+	if (pdata->ioaddr == 0) {
+		SMSC_WARNING(PROBE, "pdata->ioaddr: 0x00000000");
+		return -ENODEV;
+	}
+
+	/* Check byte ordering */
+	byte_test = smsc911x_reg_read(pdata, BYTE_TEST);
+	SMSC_TRACE(PROBE, "BYTE_TEST: 0x%08X", byte_test);
+	if (byte_test == 0x43218765) {
+		SMSC_TRACE(PROBE, "BYTE_TEST looks swapped, "
+			"applying WORD_SWAP");
+		smsc911x_reg_write(pdata, WORD_SWAP, 0xffffffff);
+
+		/* 1 dummy read of BYTE_TEST is needed after a write to
+		 * WORD_SWAP before its contents are valid */
+		byte_test = smsc911x_reg_read(pdata, BYTE_TEST);
+
+		byte_test = smsc911x_reg_read(pdata, BYTE_TEST);
+	}
+
+	if (byte_test != 0x87654321) {
+		SMSC_WARNING(DRV, "BYTE_TEST: 0x%08X", byte_test);
+		if (((byte_test >> 16) & 0xFFFF) == (byte_test & 0xFFFF)) {
+			SMSC_WARNING(PROBE,
+				"top 16 bits equal to bottom 16 bits");
+			SMSC_TRACE(PROBE, "This may mean the chip is set "
+				"for 32 bit while the bus is reading 16 bit");
+		}
+		return -ENODEV;
+	}
+
+	/* Default generation to zero (all workarounds apply) */
+	pdata->generation = 0;
+
+	pdata->idrev = smsc911x_reg_read(pdata, ID_REV);
+	switch (pdata->idrev & 0xFFFF0000) {
+	case 0x01180000:
+	case 0x01170000:
+	case 0x01160000:
+	case 0x01150000:
+		/* LAN911[5678] family */
+		pdata->generation = pdata->idrev & 0x0000FFFF;
+		break;
+
+	case 0x118A0000:
+	case 0x117A0000:
+	case 0x116A0000:
+	case 0x115A0000:
+		/* LAN921[5678] family */
+		pdata->generation = 3;
+		break;
+
+	case 0x92100000:
+	case 0x92110000:
+	case 0x92200000:
+	case 0x92210000:
+		/* LAN9210/LAN9211/LAN9220/LAN9221 */
+		pdata->generation = 4;
+		break;
+
+	default:
+		SMSC_WARNING(PROBE, "LAN911x not identified, idrev: 0x%08X",
+			pdata->idrev);
+		return -ENODEV;
+	}
+
+	SMSC_TRACE(PROBE, "LAN911x identified, idrev: 0x%08X, generation: %d",
+		pdata->idrev, pdata->generation);
+
+	if (pdata->generation == 0)
+		SMSC_WARNING(PROBE,
+			"This driver is not intended for this chip revision");
+
+	/* Reset the LAN911x */
+	if (smsc911x_soft_reset(pdata))
+		return -ENODEV;
+
+	/* Disable all interrupt sources until we bring the device up */
+	smsc911x_reg_write(pdata, INT_EN, 0);
+
+	ether_setup(dev);
+	dev->open = smsc911x_open;
+	dev->stop = smsc911x_stop;
+	dev->hard_start_xmit = smsc911x_hard_start_xmit;
+	dev->get_stats = smsc911x_get_stats;
+	dev->set_multicast_list = smsc911x_set_multicast_list;
+	dev->flags |= IFF_MULTICAST;
+	dev->do_ioctl = smsc911x_do_ioctl;
+	netif_napi_add(dev, &pdata->napi, smsc911x_poll, SMSC_NAPI_WEIGHT);
+	dev->ethtool_ops = &smsc911x_ethtool_ops;
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	dev->poll_controller = smsc911x_poll_controller;
+#endif				/* CONFIG_NET_POLL_CONTROLLER */
+
+	return 0;
+}
+
+static int __devexit smsc911x_drv_remove(struct platform_device *pdev)
+{
+	struct net_device *dev;
+	struct smsc911x_data *pdata;
+	struct resource *res;
+
+	dev = platform_get_drvdata(pdev);
+	BUG_ON(!dev);
+	pdata = netdev_priv(dev);
+	BUG_ON(!pdata);
+	BUG_ON(!pdata->ioaddr);
+	BUG_ON(!pdata->phy_dev);
+
+	SMSC_TRACE(IFDOWN, "Stopping driver.");
+
+	phy_disconnect(pdata->phy_dev);
+	pdata->phy_dev = NULL;
+	mdiobus_unregister(pdata->mii_bus);
+	mdiobus_free(pdata->mii_bus);
+
+	platform_set_drvdata(pdev, NULL);
+	unregister_netdev(dev);
+	free_irq(dev->irq, dev);
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+					   "smsc911x-memory");
+	if (!res)
+		platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
+	release_mem_region(res->start, res->end - res->start);
+
+	iounmap(pdata->ioaddr);
+
+	free_netdev(dev);
+
+	return 0;
+}
+
+static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
+{
+	struct net_device *dev;
+	struct smsc911x_data *pdata;
+	struct resource *res;
+	unsigned int intcfg = 0;
+	int res_size;
+	int retval;
+	DECLARE_MAC_BUF(mac);
+
+	pr_info("%s: Driver version %s.\n", SMSC_CHIPNAME, SMSC_DRV_VERSION);
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+					   "smsc911x-memory");
+	if (!res)
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		pr_warning("%s: Could not allocate resource.\n",
+			SMSC_CHIPNAME);
+		retval = -ENODEV;
+		goto out_0;
+	}
+	res_size = res->end - res->start;
+
+	if (!request_mem_region(res->start, res_size, SMSC_CHIPNAME)) {
+		retval = -EBUSY;
+		goto out_0;
+	}
+
+	dev = alloc_etherdev(sizeof(struct smsc911x_data));
+	if (!dev) {
+		pr_warning("%s: Could not allocate device.\n", SMSC_CHIPNAME);
+		retval = -ENOMEM;
+		goto out_release_io_1;
+	}
+
+	SET_NETDEV_DEV(dev, &pdev->dev);
+
+	pdata = netdev_priv(dev);
+
+	dev->irq = platform_get_irq(pdev, 0);
+	pdata->ioaddr = ioremap_nocache(res->start, res_size);
+
+	/* copy config parameters across if present, otherwise pdata
+	 * defaults to zeros */
+	if (pdev->dev.platform_data) {
+		struct smsc911x_platform_config *config =
+			pdev->dev.platform_data;
+		pdata->irq_polarity = config->irq_polarity;
+		pdata->irq_type  = config->irq_type;
+		pdata->phy_interface = config->phy_interface;
+	}
+
+	pdata->dev = dev;
+	pdata->msg_enable = ((1 << debug) - 1);
+
+	if (pdata->ioaddr == NULL) {
+		SMSC_WARNING(PROBE,
+			"Error smsc911x base address invalid");
+		retval = -ENOMEM;
+		goto out_free_netdev_2;
+	}
+
+	retval = smsc911x_init(dev);
+	if (retval < 0)
+		goto out_unmap_io_3;
+
+	/* configure irq polarity and type before connecting isr */
+	if (pdata->irq_polarity == SMSC911X_IRQ_POLARITY_ACTIVE_HIGH)
+		intcfg |= INT_CFG_IRQ_POL_;
+
+	if (pdata->irq_type == SMSC911X_IRQ_TYPE_PUSH_PULL)
+		intcfg |= INT_CFG_IRQ_TYPE_;
+
+	smsc911x_reg_write(pdata, INT_CFG, intcfg);
+
+	/* Ensure interrupts are globally disabled before connecting ISR */
+	smsc911x_reg_write(pdata, INT_EN, 0);
+	smsc911x_reg_write(pdata, INT_STS, 0xFFFFFFFF);
+
+	retval = request_irq(dev->irq, smsc911x_irqhandler, IRQF_DISABLED,
+			     SMSC_CHIPNAME, dev);
+	if (retval) {
+		SMSC_WARNING(PROBE,
+			"Unable to claim requested irq: %d", dev->irq);
+		goto out_unmap_io_3;
+	}
+
+	platform_set_drvdata(pdev, dev);
+
+	retval = register_netdev(dev);
+	if (retval) {
+		SMSC_WARNING(PROBE,
+			"Error %i registering device", retval);
+		goto out_unset_drvdata_4;
+	} else {
+		SMSC_TRACE(PROBE, "Network interface: \"%s\"", dev->name);
+	}
+
+	spin_lock_init(&pdata->mac_lock);
+
+	retval = smsc911x_mii_init(pdev, dev);
+	if (retval) {
+		SMSC_WARNING(PROBE,
+			"Error %i initialising mii", retval);
+		goto out_unregister_netdev_5;
+	}
+
+	spin_lock_irq(&pdata->mac_lock);
+
+	/* Check if mac address has been specified when bringing interface up */
+	if (is_valid_ether_addr(dev->dev_addr)) {
+		smsc911x_set_mac_address(pdata, dev->dev_addr);
+		SMSC_TRACE(PROBE, "MAC Address is specified by configuration");
+	} else {
+		/* Try reading mac address from device. if EEPROM is present
+		 * it will already have been set */
+		u32 mac_high16 = smsc911x_mac_read(pdata, ADDRH);
+		u32 mac_low32 = smsc911x_mac_read(pdata, ADDRL);
+		dev->dev_addr[0] = (u8)(mac_low32);
+		dev->dev_addr[1] = (u8)(mac_low32 >> 8);
+		dev->dev_addr[2] = (u8)(mac_low32 >> 16);
+		dev->dev_addr[3] = (u8)(mac_low32 >> 24);
+		dev->dev_addr[4] = (u8)(mac_high16);
+		dev->dev_addr[5] = (u8)(mac_high16 >> 8);
+
+		if (is_valid_ether_addr(dev->dev_addr)) {
+			/* eeprom values are valid  so use them */
+			SMSC_TRACE(PROBE,
+				"Mac Address is read from LAN911x EEPROM");
+		} else {
+			/* eeprom values are invalid, generate random MAC */
+			random_ether_addr(dev->dev_addr);
+			smsc911x_set_mac_address(pdata, dev->dev_addr);
+			SMSC_TRACE(PROBE,
+				"MAC Address is set to random_ether_addr");
+		}
+	}
+
+	spin_unlock_irq(&pdata->mac_lock);
+
+	dev_info(&dev->dev, "MAC Address: %s\n",
+		 print_mac(mac, dev->dev_addr));
+
+	return 0;
+
+out_unregister_netdev_5:
+	unregister_netdev(dev);
+out_unset_drvdata_4:
+	platform_set_drvdata(pdev, NULL);
+	free_irq(dev->irq, dev);
+out_unmap_io_3:
+	iounmap(pdata->ioaddr);
+out_free_netdev_2:
+	free_netdev(dev);
+out_release_io_1:
+	release_mem_region(res->start, res->end - res->start);
+out_0:
+	return retval;
+}
+
+static struct platform_driver smsc911x_driver = {
+	.probe = smsc911x_drv_probe,
+	.remove = smsc911x_drv_remove,
+	.driver = {
+		.name = SMSC_CHIPNAME,
+	},
+};
+
+/* Entry point for loading the module */
+static int __init smsc911x_init_module(void)
+{
+	return platform_driver_register(&smsc911x_driver);
+}
+
+/* entry point for unloading the module */
+static void __exit smsc911x_cleanup_module(void)
+{
+	platform_driver_unregister(&smsc911x_driver);
+}
+
+module_init(smsc911x_init_module);
+module_exit(smsc911x_cleanup_module);
diff --git a/drivers/net/smsc911x.h b/drivers/net/smsc911x.h
new file mode 100644
index 00000000000..feb36de274c
--- /dev/null
+++ b/drivers/net/smsc911x.h
@@ -0,0 +1,394 @@
+/***************************************************************************
+ *
+ * Copyright (C) 2004-2008 SMSC
+ * Copyright (C) 2005-2008 ARM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ ***************************************************************************/
+#ifndef __SMSC911X_H__
+#define __SMSC911X_H__
+
+#define SMSC_CAN_USE_32BIT	1
+#define TX_FIFO_LOW_THRESHOLD	((u32)1600)
+#define SMSC911X_EEPROM_SIZE	((u32)7)
+#define USE_DEBUG		0
+
+/* This is the maximum number of packets to be received every
+ * NAPI poll */
+#define SMSC_NAPI_WEIGHT	16
+
+/* implements a PHY loopback test at initialisation time, to ensure a packet
+ * can be succesfully looped back */
+#define USE_PHY_WORK_AROUND
+
+#define DPRINTK(nlevel, klevel, fmt, args...) \
+	((void)((NETIF_MSG_##nlevel & pdata->msg_enable) && \
+	printk(KERN_##klevel "%s: %s: " fmt "\n", \
+	pdata->dev->name, __func__, ## args)))
+
+#if USE_DEBUG >= 1
+#define SMSC_WARNING(nlevel, fmt, args...) \
+	DPRINTK(nlevel, WARNING, fmt, ## args)
+#else
+#define SMSC_WARNING(nlevel, fmt, args...) \
+	({ do {} while (0); 0; })
+#endif
+
+#if USE_DEBUG >= 2
+#define SMSC_TRACE(nlevel, fmt, args...) \
+	DPRINTK(nlevel, INFO, fmt, ## args)
+#else
+#define SMSC_TRACE(nlevel, fmt, args...) \
+	({ do {} while (0); 0; })
+#endif
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define SMSC_ASSERT_MAC_LOCK(pdata) \
+		WARN_ON(!spin_is_locked(&pdata->mac_lock))
+#else
+#define SMSC_ASSERT_MAC_LOCK(pdata) do {} while (0)
+#endif				/* CONFIG_DEBUG_SPINLOCK */
+
+#define FLOW_CTRL_TX		(1)
+#define FLOW_CTRL_RX		(2)
+
+/* SMSC911x registers and bitfields */
+#define RX_DATA_FIFO			0x00
+
+#define TX_DATA_FIFO			0x20
+#define TX_CMD_A_ON_COMP_		0x80000000
+#define TX_CMD_A_BUF_END_ALGN_		0x03000000
+#define TX_CMD_A_4_BYTE_ALGN_		0x00000000
+#define TX_CMD_A_16_BYTE_ALGN_		0x01000000
+#define TX_CMD_A_32_BYTE_ALGN_		0x02000000
+#define TX_CMD_A_DATA_OFFSET_		0x001F0000
+#define TX_CMD_A_FIRST_SEG_		0x00002000
+#define TX_CMD_A_LAST_SEG_		0x00001000
+#define TX_CMD_A_BUF_SIZE_		0x000007FF
+#define TX_CMD_B_PKT_TAG_		0xFFFF0000
+#define TX_CMD_B_ADD_CRC_DISABLE_	0x00002000
+#define TX_CMD_B_DISABLE_PADDING_	0x00001000
+#define TX_CMD_B_PKT_BYTE_LENGTH_	0x000007FF
+
+#define RX_STATUS_FIFO			0x40
+#define RX_STS_ES_			0x00008000
+#define RX_STS_MCAST_			0x00000400
+
+#define RX_STATUS_FIFO_PEEK		0x44
+
+#define TX_STATUS_FIFO			0x48
+#define TX_STS_ES_			0x00008000
+
+#define TX_STATUS_FIFO_PEEK		0x4C
+
+#define ID_REV				0x50
+#define ID_REV_CHIP_ID_			0xFFFF0000
+#define ID_REV_REV_ID_			0x0000FFFF
+
+#define INT_CFG				0x54
+#define INT_CFG_INT_DEAS_		0xFF000000
+#define INT_CFG_INT_DEAS_CLR_		0x00004000
+#define INT_CFG_INT_DEAS_STS_		0x00002000
+#define INT_CFG_IRQ_INT_		0x00001000
+#define INT_CFG_IRQ_EN_			0x00000100
+#define INT_CFG_IRQ_POL_		0x00000010
+#define INT_CFG_IRQ_TYPE_		0x00000001
+
+#define INT_STS				0x58
+#define INT_STS_SW_INT_			0x80000000
+#define INT_STS_TXSTOP_INT_		0x02000000
+#define INT_STS_RXSTOP_INT_		0x01000000
+#define INT_STS_RXDFH_INT_		0x00800000
+#define INT_STS_RXDF_INT_		0x00400000
+#define INT_STS_TX_IOC_			0x00200000
+#define INT_STS_RXD_INT_		0x00100000
+#define INT_STS_GPT_INT_		0x00080000
+#define INT_STS_PHY_INT_		0x00040000
+#define INT_STS_PME_INT_		0x00020000
+#define INT_STS_TXSO_			0x00010000
+#define INT_STS_RWT_			0x00008000
+#define INT_STS_RXE_			0x00004000
+#define INT_STS_TXE_			0x00002000
+#define INT_STS_TDFU_			0x00000800
+#define INT_STS_TDFO_			0x00000400
+#define INT_STS_TDFA_			0x00000200
+#define INT_STS_TSFF_			0x00000100
+#define INT_STS_TSFL_			0x00000080
+#define INT_STS_RXDF_			0x00000040
+#define INT_STS_RDFL_			0x00000020
+#define INT_STS_RSFF_			0x00000010
+#define INT_STS_RSFL_			0x00000008
+#define INT_STS_GPIO2_INT_		0x00000004
+#define INT_STS_GPIO1_INT_		0x00000002
+#define INT_STS_GPIO0_INT_		0x00000001
+
+#define INT_EN				0x5C
+#define INT_EN_SW_INT_EN_		0x80000000
+#define INT_EN_TXSTOP_INT_EN_		0x02000000
+#define INT_EN_RXSTOP_INT_EN_		0x01000000
+#define INT_EN_RXDFH_INT_EN_		0x00800000
+#define INT_EN_TIOC_INT_EN_		0x00200000
+#define INT_EN_RXD_INT_EN_		0x00100000
+#define INT_EN_GPT_INT_EN_		0x00080000
+#define INT_EN_PHY_INT_EN_		0x00040000
+#define INT_EN_PME_INT_EN_		0x00020000
+#define INT_EN_TXSO_EN_			0x00010000
+#define INT_EN_RWT_EN_			0x00008000
+#define INT_EN_RXE_EN_			0x00004000
+#define INT_EN_TXE_EN_			0x00002000
+#define INT_EN_TDFU_EN_			0x00000800
+#define INT_EN_TDFO_EN_			0x00000400
+#define INT_EN_TDFA_EN_			0x00000200
+#define INT_EN_TSFF_EN_			0x00000100
+#define INT_EN_TSFL_EN_			0x00000080
+#define INT_EN_RXDF_EN_			0x00000040
+#define INT_EN_RDFL_EN_			0x00000020
+#define INT_EN_RSFF_EN_			0x00000010
+#define INT_EN_RSFL_EN_			0x00000008
+#define INT_EN_GPIO2_INT_		0x00000004
+#define INT_EN_GPIO1_INT_		0x00000002
+#define INT_EN_GPIO0_INT_		0x00000001
+
+#define BYTE_TEST			0x64
+
+#define FIFO_INT			0x68
+#define FIFO_INT_TX_AVAIL_LEVEL_	0xFF000000
+#define FIFO_INT_TX_STS_LEVEL_		0x00FF0000
+#define FIFO_INT_RX_AVAIL_LEVEL_	0x0000FF00
+#define FIFO_INT_RX_STS_LEVEL_		0x000000FF
+
+#define RX_CFG				0x6C
+#define RX_CFG_RX_END_ALGN_		0xC0000000
+#define RX_CFG_RX_END_ALGN4_		0x00000000
+#define RX_CFG_RX_END_ALGN16_		0x40000000
+#define RX_CFG_RX_END_ALGN32_		0x80000000
+#define RX_CFG_RX_DMA_CNT_		0x0FFF0000
+#define RX_CFG_RX_DUMP_			0x00008000
+#define RX_CFG_RXDOFF_			0x00001F00
+
+#define TX_CFG				0x70
+#define TX_CFG_TXS_DUMP_		0x00008000
+#define TX_CFG_TXD_DUMP_		0x00004000
+#define TX_CFG_TXSAO_			0x00000004
+#define TX_CFG_TX_ON_			0x00000002
+#define TX_CFG_STOP_TX_			0x00000001
+
+#define HW_CFG				0x74
+#define HW_CFG_TTM_			0x00200000
+#define HW_CFG_SF_			0x00100000
+#define HW_CFG_TX_FIF_SZ_		0x000F0000
+#define HW_CFG_TR_			0x00003000
+#define HW_CFG_SRST_			0x00000001
+
+/* only available on 115/117 */
+#define HW_CFG_PHY_CLK_SEL_		0x00000060
+#define HW_CFG_PHY_CLK_SEL_INT_PHY_	0x00000000
+#define HW_CFG_PHY_CLK_SEL_EXT_PHY_	0x00000020
+#define HW_CFG_PHY_CLK_SEL_CLK_DIS_	0x00000040
+#define HW_CFG_SMI_SEL_		 	0x00000010
+#define HW_CFG_EXT_PHY_DET_		0x00000008
+#define HW_CFG_EXT_PHY_EN_		0x00000004
+#define HW_CFG_SRST_TO_			0x00000002
+
+/* only available  on 116/118 */
+#define HW_CFG_32_16_BIT_MODE_		0x00000004
+
+#define RX_DP_CTRL			0x78
+#define RX_DP_CTRL_RX_FFWD_		0x80000000
+
+#define RX_FIFO_INF			0x7C
+#define RX_FIFO_INF_RXSUSED_		0x00FF0000
+#define RX_FIFO_INF_RXDUSED_		0x0000FFFF
+
+#define TX_FIFO_INF			0x80
+#define TX_FIFO_INF_TSUSED_		0x00FF0000
+#define TX_FIFO_INF_TDFREE_		0x0000FFFF
+
+#define PMT_CTRL			0x84
+#define PMT_CTRL_PM_MODE_		0x00003000
+#define PMT_CTRL_PM_MODE_D0_		0x00000000
+#define PMT_CTRL_PM_MODE_D1_		0x00001000
+#define PMT_CTRL_PM_MODE_D2_		0x00002000
+#define PMT_CTRL_PM_MODE_D3_		0x00003000
+#define PMT_CTRL_PHY_RST_		0x00000400
+#define PMT_CTRL_WOL_EN_		0x00000200
+#define PMT_CTRL_ED_EN_			0x00000100
+#define PMT_CTRL_PME_TYPE_		0x00000040
+#define PMT_CTRL_WUPS_			0x00000030
+#define PMT_CTRL_WUPS_NOWAKE_		0x00000000
+#define PMT_CTRL_WUPS_ED_		0x00000010
+#define PMT_CTRL_WUPS_WOL_		0x00000020
+#define PMT_CTRL_WUPS_MULTI_		0x00000030
+#define PMT_CTRL_PME_IND_		0x00000008
+#define PMT_CTRL_PME_POL_		0x00000004
+#define PMT_CTRL_PME_EN_		0x00000002
+#define PMT_CTRL_READY_			0x00000001
+
+#define GPIO_CFG			0x88
+#define GPIO_CFG_LED3_EN_		0x40000000
+#define GPIO_CFG_LED2_EN_		0x20000000
+#define GPIO_CFG_LED1_EN_		0x10000000
+#define GPIO_CFG_GPIO2_INT_POL_		0x04000000
+#define GPIO_CFG_GPIO1_INT_POL_		0x02000000
+#define GPIO_CFG_GPIO0_INT_POL_		0x01000000
+#define GPIO_CFG_EEPR_EN_		0x00700000
+#define GPIO_CFG_GPIOBUF2_		0x00040000
+#define GPIO_CFG_GPIOBUF1_		0x00020000
+#define GPIO_CFG_GPIOBUF0_		0x00010000
+#define GPIO_CFG_GPIODIR2_		0x00000400
+#define GPIO_CFG_GPIODIR1_		0x00000200
+#define GPIO_CFG_GPIODIR0_		0x00000100
+#define GPIO_CFG_GPIOD4_		0x00000020
+#define GPIO_CFG_GPIOD3_		0x00000010
+#define GPIO_CFG_GPIOD2_		0x00000004
+#define GPIO_CFG_GPIOD1_		0x00000002
+#define GPIO_CFG_GPIOD0_		0x00000001
+
+#define GPT_CFG				0x8C
+#define GPT_CFG_TIMER_EN_		0x20000000
+#define GPT_CFG_GPT_LOAD_		0x0000FFFF
+
+#define GPT_CNT				0x90
+#define GPT_CNT_GPT_CNT_		0x0000FFFF
+
+#define WORD_SWAP			0x98
+
+#define FREE_RUN			0x9C
+
+#define RX_DROP				0xA0
+
+#define MAC_CSR_CMD			0xA4
+#define MAC_CSR_CMD_CSR_BUSY_		0x80000000
+#define MAC_CSR_CMD_R_NOT_W_		0x40000000
+#define MAC_CSR_CMD_CSR_ADDR_		0x000000FF
+
+#define MAC_CSR_DATA			0xA8
+
+#define AFC_CFG				0xAC
+#define AFC_CFG_AFC_HI_			0x00FF0000
+#define AFC_CFG_AFC_LO_			0x0000FF00
+#define AFC_CFG_BACK_DUR_		0x000000F0
+#define AFC_CFG_FCMULT_			0x00000008
+#define AFC_CFG_FCBRD_			0x00000004
+#define AFC_CFG_FCADD_			0x00000002
+#define AFC_CFG_FCANY_			0x00000001
+
+#define E2P_CMD				0xB0
+#define E2P_CMD_EPC_BUSY_		0x80000000
+#define E2P_CMD_EPC_CMD_		0x70000000
+#define E2P_CMD_EPC_CMD_READ_		0x00000000
+#define E2P_CMD_EPC_CMD_EWDS_		0x10000000
+#define E2P_CMD_EPC_CMD_EWEN_		0x20000000
+#define E2P_CMD_EPC_CMD_WRITE_		0x30000000
+#define E2P_CMD_EPC_CMD_WRAL_		0x40000000
+#define E2P_CMD_EPC_CMD_ERASE_		0x50000000
+#define E2P_CMD_EPC_CMD_ERAL_		0x60000000
+#define E2P_CMD_EPC_CMD_RELOAD_		0x70000000
+#define E2P_CMD_EPC_TIMEOUT_		0x00000200
+#define E2P_CMD_MAC_ADDR_LOADED_	0x00000100
+#define E2P_CMD_EPC_ADDR_		0x000000FF
+
+#define E2P_DATA			0xB4
+#define E2P_DATA_EEPROM_DATA_		0x000000FF
+#define LAN_REGISTER_EXTENT		0x00000100
+
+/*
+ * MAC Control and Status Register (Indirect Address)
+ * Offset (through the MAC_CSR CMD and DATA port)
+ */
+#define MAC_CR				0x01
+#define MAC_CR_RXALL_			0x80000000
+#define MAC_CR_HBDIS_			0x10000000
+#define MAC_CR_RCVOWN_			0x00800000
+#define MAC_CR_LOOPBK_			0x00200000
+#define MAC_CR_FDPX_			0x00100000
+#define MAC_CR_MCPAS_			0x00080000
+#define MAC_CR_PRMS_			0x00040000
+#define MAC_CR_INVFILT_			0x00020000
+#define MAC_CR_PASSBAD_			0x00010000
+#define MAC_CR_HFILT_			0x00008000
+#define MAC_CR_HPFILT_			0x00002000
+#define MAC_CR_LCOLL_			0x00001000
+#define MAC_CR_BCAST_			0x00000800
+#define MAC_CR_DISRTY_			0x00000400
+#define MAC_CR_PADSTR_			0x00000100
+#define MAC_CR_BOLMT_MASK_		0x000000C0
+#define MAC_CR_DFCHK_			0x00000020
+#define MAC_CR_TXEN_			0x00000008
+#define MAC_CR_RXEN_			0x00000004
+
+#define ADDRH				0x02
+
+#define ADDRL				0x03
+
+#define HASHH				0x04
+
+#define HASHL				0x05
+
+#define MII_ACC				0x06
+#define MII_ACC_PHY_ADDR_		0x0000F800
+#define MII_ACC_MIIRINDA_		0x000007C0
+#define MII_ACC_MII_WRITE_		0x00000002
+#define MII_ACC_MII_BUSY_		0x00000001
+
+#define MII_DATA			0x07
+
+#define FLOW				0x08
+#define FLOW_FCPT_			0xFFFF0000
+#define FLOW_FCPASS_			0x00000004
+#define FLOW_FCEN_			0x00000002
+#define FLOW_FCBSY_			0x00000001
+
+#define VLAN1				0x09
+
+#define VLAN2				0x0A
+
+#define WUFF				0x0B
+
+#define WUCSR				0x0C
+#define WUCSR_GUE_			0x00000200
+#define WUCSR_WUFR_			0x00000040
+#define WUCSR_MPR_			0x00000020
+#define WUCSR_WAKE_EN_			0x00000004
+#define WUCSR_MPEN_			0x00000002
+
+/*
+ * Phy definitions (vendor-specific)
+ */
+#define LAN9118_PHY_ID			0x00C0001C
+
+#define MII_INTSTS			0x1D
+
+#define MII_INTMSK			0x1E
+#define PHY_INTMSK_AN_RCV_		(1 << 1)
+#define PHY_INTMSK_PDFAULT_		(1 << 2)
+#define PHY_INTMSK_AN_ACK_		(1 << 3)
+#define PHY_INTMSK_LNKDOWN_		(1 << 4)
+#define PHY_INTMSK_RFAULT_		(1 << 5)
+#define PHY_INTMSK_AN_COMP_		(1 << 6)
+#define PHY_INTMSK_ENERGYON_		(1 << 7)
+#define PHY_INTMSK_DEFAULT_		(PHY_INTMSK_ENERGYON_ | \
+					 PHY_INTMSK_AN_COMP_ | \
+					 PHY_INTMSK_RFAULT_ | \
+					 PHY_INTMSK_LNKDOWN_)
+
+#define ADVERTISE_PAUSE_ALL		(ADVERTISE_PAUSE_CAP | \
+					 ADVERTISE_PAUSE_ASYM)
+
+#define LPA_PAUSE_ALL			(LPA_PAUSE_CAP | \
+					 LPA_PAUSE_ASYM)
+
+#endif				/* __SMSC911X_H__ */
diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h
new file mode 100644
index 00000000000..47c4ffd10db
--- /dev/null
+++ b/include/linux/smsc911x.h
@@ -0,0 +1,42 @@
+/***************************************************************************
+ *
+ * Copyright (C) 2004-2008 SMSC
+ * Copyright (C) 2005-2008 ARM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ ***************************************************************************/
+#ifndef __LINUX_SMSC911X_H__
+#define __LINUX_SMSC911X_H__
+
+#include <linux/phy.h>
+
+/* platform_device configuration data, should be assigned to
+ * the platform_device's dev.platform_data */
+struct smsc911x_platform_config {
+	unsigned int irq_polarity;
+	unsigned int irq_type;
+	phy_interface_t phy_interface;
+};
+
+/* Constants for platform_device irq polarity configuration */
+#define SMSC911X_IRQ_POLARITY_ACTIVE_LOW	0
+#define SMSC911X_IRQ_POLARITY_ACTIVE_HIGH	1
+
+/* Constants for platform_device irq type configuration */
+#define SMSC911X_IRQ_TYPE_OPEN_DRAIN		0
+#define SMSC911X_IRQ_TYPE_PUSH_PULL		1
+
+#endif /* __LINUX_SMSC911X_H__ */
-- 
cgit v1.2.3-70-g09d2


From 60a7ecf42661f2b22168751298592da6ee210c9e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Nov 2008 16:05:44 -0500
Subject: ftrace: add quick function trace stop

Impact: quick start and stop of function tracer

This patch adds a way to disable the function tracer quickly without
the need to run kstop_machine. It adds a new variable called
function_trace_stop which will stop the calls to functions from mcount
when set.  This is just an on/off switch and does not handle recursion
like preempt_disable().

It's main purpose is to help other tracers/debuggers start and stop tracing
fuctions without the need to call kstop_machine.

The config option HAVE_FUNCTION_TRACE_MCOUNT_TEST is added for archs
that implement the testing of the function_trace_stop in the mcount
arch dependent code. Otherwise, the test is done in the C code.

x86 is the only arch at the moment that supports this.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig           |  1 +
 arch/x86/kernel/entry_32.S |  6 ++++++
 arch/x86/kernel/entry_64.S |  5 +++++
 include/linux/ftrace.h     | 30 +++++++++++++++++++++++++++++
 kernel/trace/Kconfig       |  7 +++++++
 kernel/trace/ftrace.c      | 47 ++++++++++++++++++++++++++++++++++++----------
 6 files changed, 86 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6f20718d315..d09e812c622 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,6 +29,7 @@ config X86
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca..9134de814c9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	pushl %eax
 	pushl %ecx
 	pushl %edx
@@ -1180,6 +1183,9 @@ END(ftrace_caller)
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(mcount)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	cmpl $ftrace_stub, ftrace_trace_function
 	jnz trace
 .globl ftrace_stub
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a..08aa6b10933 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,6 +68,8 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
 
 	/* taken from glibc */
 	subq $0x38, %rsp
@@ -103,6 +105,9 @@ END(ftrace_caller)
 
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 ENTRY(mcount)
+	cmpl $0, function_trace_stop
+	jne  ftrace_stub
+
 	cmpq $ftrace_stub, ftrace_trace_function
 	jnz trace
 .globl ftrace_stub
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4642959e5bd..794ab907dbf 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -23,6 +23,34 @@ struct ftrace_ops {
 	struct ftrace_ops *next;
 };
 
+extern int function_trace_stop;
+
+/**
+ * ftrace_stop - stop function tracer.
+ *
+ * A quick way to stop the function tracer. Note this an on off switch,
+ * it is not something that is recursive like preempt_disable.
+ * This does not disable the calling of mcount, it only stops the
+ * calling of functions from mcount.
+ */
+static inline void ftrace_stop(void)
+{
+	function_trace_stop = 1;
+}
+
+/**
+ * ftrace_start - start the function tracer.
+ *
+ * This function is the inverse of ftrace_stop. This does not enable
+ * the function tracing if the function tracer is disabled. This only
+ * sets the function tracer flag to continue calling the functions
+ * from mcount.
+ */
+static inline void ftrace_start(void)
+{
+	function_trace_stop = 0;
+}
+
 /*
  * The ftrace_ops must be a static and should also
  * be read_mostly.  These functions do modify read_mostly variables
@@ -41,6 +69,8 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
 static inline void ftrace_kill(void) { }
+static inline void ftrace_stop(void) { }
+static inline void ftrace_start(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e..fc4febc3334 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,13 @@ config NOP_TRACER
 config HAVE_FUNCTION_TRACER
 	bool
 
+config HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	bool
+	help
+	 This gets selected when the arch tests the function_trace_stop
+	 variable at the mcount call site. Otherwise, this variable
+	 is tested by the called function.
+
 config HAVE_DYNAMIC_FTRACE
 	bool
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a39d24568c..896c71f0f4c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,9 @@
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
+/* Quick disabling of function tracer. */
+int function_trace_stop;
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -63,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
 
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
@@ -88,8 +92,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 void clear_ftrace_function(void)
 {
 	ftrace_trace_function = ftrace_stub;
+	__ftrace_trace_function = ftrace_stub;
 }
 
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+/*
+ * For those archs that do not test ftrace_trace_stop in their
+ * mcount call site, we need to do it from C.
+ */
+static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
+{
+	if (function_trace_stop)
+		return;
+
+	__ftrace_trace_function(ip, parent_ip);
+}
+#endif
+
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
 	/* should not be called from interrupt context */
@@ -110,10 +129,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 		 * For one func, simply call it directly.
 		 * For more than one func, call the chain.
 		 */
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
 		if (ops->next == &ftrace_list_end)
 			ftrace_trace_function = ops->func;
 		else
 			ftrace_trace_function = ftrace_list_func;
+#else
+		if (ops->next == &ftrace_list_end)
+			__ftrace_trace_function = ops->func;
+		else
+			__ftrace_trace_function = ftrace_list_func;
+		ftrace_trace_function = ftrace_test_stop_func;
+#endif
 	}
 
 	spin_unlock(&ftrace_lock);
@@ -526,7 +553,7 @@ static void ftrace_run_update_code(int command)
 }
 
 static ftrace_func_t saved_ftrace_func;
-static int ftrace_start;
+static int ftrace_start_up;
 static DEFINE_MUTEX(ftrace_start_lock);
 
 static void ftrace_startup(void)
@@ -537,8 +564,8 @@ static void ftrace_startup(void)
 		return;
 
 	mutex_lock(&ftrace_start_lock);
-	ftrace_start++;
-	if (ftrace_start == 1)
+	ftrace_start_up++;
+	if (ftrace_start_up == 1)
 		command |= FTRACE_ENABLE_CALLS;
 
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -562,8 +589,8 @@ static void ftrace_shutdown(void)
 		return;
 
 	mutex_lock(&ftrace_start_lock);
-	ftrace_start--;
-	if (!ftrace_start)
+	ftrace_start_up--;
+	if (!ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -589,8 +616,8 @@ static void ftrace_startup_sysctl(void)
 	mutex_lock(&ftrace_start_lock);
 	/* Force update next time */
 	saved_ftrace_func = NULL;
-	/* ftrace_start is true if we want ftrace running */
-	if (ftrace_start)
+	/* ftrace_start_up is true if we want ftrace running */
+	if (ftrace_start_up)
 		command |= FTRACE_ENABLE_CALLS;
 
 	ftrace_run_update_code(command);
@@ -605,8 +632,8 @@ static void ftrace_shutdown_sysctl(void)
 		return;
 
 	mutex_lock(&ftrace_start_lock);
-	/* ftrace_start is true if ftrace is running */
-	if (ftrace_start)
+	/* ftrace_start_up is true if ftrace is running */
+	if (ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
 	ftrace_run_update_code(command);
@@ -1186,7 +1213,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	mutex_lock(&ftrace_start_lock);
-	if (iter->filtered && ftrace_start && ftrace_enabled)
+	if (iter->filtered && ftrace_start_up && ftrace_enabled)
 		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
 	mutex_unlock(&ftrace_start_lock);
 	mutex_unlock(&ftrace_sysctl_lock);
-- 
cgit v1.2.3-70-g09d2


From 0f04870148ecb825133bc2733f473b1c5773ac0b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 Nov 2008 16:05:44 -0500
Subject: ftrace: soft tracing stop and start

Impact: add way to quickly start stop tracing from the kernel

This patch adds a soft stop and start to the trace. This simply
disables function tracing via the ftrace_disabled flag, and
disables the trace buffers to prevent recording. The tracing
code may still be executed, but the trace will not be recorded.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  5 ++++
 kernel/trace/trace.c   | 81 ++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 794ab907dbf..7a75fc6d41f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -216,6 +216,9 @@ static inline void __ftrace_enabled_restore(int enabled)
 #ifdef CONFIG_TRACING
 extern int ftrace_dump_on_oops;
 
+extern void tracing_start(void);
+extern void tracing_stop(void);
+
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
@@ -246,6 +249,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
 ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
 
+static inline void tracing_start(void) { }
+static inline void tracing_stop(void) { }
 static inline int
 ftrace_printk(const char *fmt, ...)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 29ab40a764c..113aea9447e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,6 +43,15 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+
+/*
+ * Kill all tracing for good (never come back).
+ * It is initialized to 1 but will turn to zero if the initialization
+ * of the tracer is successful. But that is the only place that sets
+ * this back to zero.
+ */
+int tracing_disabled = 1;
+
 static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
 
 static inline void ftrace_disable_cpu(void)
@@ -62,8 +71,6 @@ static cpumask_t __read_mostly		tracing_buffer_mask;
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu_mask(cpu, tracing_buffer_mask)
 
-static int tracing_disabled = 1;
-
 /*
  * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
  *
@@ -613,6 +620,76 @@ static void trace_init_cmdlines(void)
 	cmdline_idx = 0;
 }
 
+static int trace_stop_count;
+static DEFINE_SPINLOCK(tracing_start_lock);
+
+/**
+ * tracing_start - quick start of the tracer
+ *
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
+ */
+void tracing_start(void)
+{
+	struct ring_buffer *buffer;
+	unsigned long flags;
+
+	if (tracing_disabled)
+		return;
+
+	spin_lock_irqsave(&tracing_start_lock, flags);
+	if (--trace_stop_count)
+		goto out;
+
+	if (trace_stop_count < 0) {
+		/* Someone screwed up their debugging */
+		WARN_ON_ONCE(1);
+		trace_stop_count = 0;
+		goto out;
+	}
+
+
+	buffer = global_trace.buffer;
+	if (buffer)
+		ring_buffer_record_enable(buffer);
+
+	buffer = max_tr.buffer;
+	if (buffer)
+		ring_buffer_record_enable(buffer);
+
+	ftrace_start();
+ out:
+	spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
+{
+	struct ring_buffer *buffer;
+	unsigned long flags;
+
+	ftrace_stop();
+	spin_lock_irqsave(&tracing_start_lock, flags);
+	if (trace_stop_count++)
+		goto out;
+
+	buffer = global_trace.buffer;
+	if (buffer)
+		ring_buffer_record_disable(buffer);
+
+	buffer = max_tr.buffer;
+	if (buffer)
+		ring_buffer_record_disable(buffer);
+
+ out:
+	spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
 void trace_stop_cmdline_recording(void);
 
 static void trace_save_cmdline(struct task_struct *tsk)
-- 
cgit v1.2.3-70-g09d2


From 6a60dd121c5b6c2d827e99b38c1326f2600c3891 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 6 Nov 2008 15:55:21 -0500
Subject: ftrace: split out hardirq ftrace code into own header

Impact: moving of function prototypes into own header file

ftrace.h is too big of a file for hardirq.h, and some archs will fail
to build because of the include dependencies not being met.

This patch pulls out the required prototypes for hardirq.h into a smaller
and safer ftrace_irq.h file.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h     |  5 -----
 include/linux/ftrace_irq.h | 13 +++++++++++++
 include/linux/hardirq.h    |  2 +-
 3 files changed, 14 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/ftrace_irq.h

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0ad1b48aea6..1b340e3fa24 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -104,9 +104,6 @@ extern void ftrace_release(void *start, unsigned long size);
 
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
-extern void ftrace_nmi_enter(void);
-extern void ftrace_nmi_exit(void);
-
 #else
 # define skip_trace(ip)				({ 0; })
 # define ftrace_force_update()			({ 0; })
@@ -114,8 +111,6 @@ extern void ftrace_nmi_exit(void);
 # define ftrace_disable_daemon()		do { } while (0)
 # define ftrace_enable_daemon()			do { } while (0)
 static inline void ftrace_release(void *start, unsigned long size) { }
-static inline void ftrace_nmi_enter(void) { }
-static inline void ftrace_nmi_exit(void) { }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /* totally disable ftrace - can not re-enable after this */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
new file mode 100644
index 00000000000..b1299d6729f
--- /dev/null
+++ b/include/linux/ftrace_irq.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_FTRACE_IRQ_H
+#define _LINUX_FTRACE_IRQ_H
+
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_nmi_enter(void);
+extern void ftrace_nmi_exit(void);
+#else
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
+#endif
+
+#endif /* _LINUX_FTRACE_IRQ_H */
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index ffc16ab5a87..89a56d79e4c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -4,7 +4,7 @@
 #include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <linux/lockdep.h>
-#include <linux/ftrace.h>
+#include <linux/ftrace_irq.h>
 #include <asm/hardirq.h>
 #include <asm/system.h>
 
-- 
cgit v1.2.3-70-g09d2


From 307ba6dd73254fe7d2ce27db64ffd90e1bb3c6c0 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Fri, 7 Nov 2008 17:37:33 +0000
Subject: uwb: don't unbind the radio controller driver when resetting

Use pre_reset and post_reset methods to avoid unbinding the radio
controller driver after a uwb_rc_reset_all() call.  This avoids a
deadlock in uwb_rc_rm() when waiting for the uwb event thread to stop.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/hwa-rc.c       | 22 +++++++++++++++-
 drivers/uwb/reset.c        | 49 ++++++++++++++++++++++++++++++------
 drivers/uwb/rsv.c          | 29 +++++++++++++++-------
 drivers/uwb/umc-bus.c      | 62 ++++++++++++++++++++++++++++++++--------------
 drivers/uwb/uwb-internal.h |  1 +
 drivers/uwb/whc-rc.c       | 30 ++++++++++++++++++----
 include/linux/uwb.h        |  2 ++
 include/linux/uwb/umc.h    |  2 ++
 8 files changed, 155 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c
index 18009c99577..158e98d08af 100644
--- a/drivers/uwb/hwa-rc.c
+++ b/drivers/uwb/hwa-rc.c
@@ -881,6 +881,24 @@ static void hwarc_disconnect(struct usb_interface *iface)
 	uwb_rc_put(uwb_rc);	/* when creating the device, refcount = 1 */
 }
 
+static int hwarc_pre_reset(struct usb_interface *iface)
+{
+	struct hwarc *hwarc = usb_get_intfdata(iface);
+	struct uwb_rc *uwb_rc = hwarc->uwb_rc;
+
+	uwb_rc_pre_reset(uwb_rc);
+	return 0;
+}
+
+static int hwarc_post_reset(struct usb_interface *iface)
+{
+	struct hwarc *hwarc = usb_get_intfdata(iface);
+	struct uwb_rc *uwb_rc = hwarc->uwb_rc;
+
+	uwb_rc_post_reset(uwb_rc);
+	return 0;
+}
+
 /** USB device ID's that we handle */
 static struct usb_device_id hwarc_id_table[] = {
 	/* D-Link DUB-1210 */
@@ -897,9 +915,11 @@ MODULE_DEVICE_TABLE(usb, hwarc_id_table);
 
 static struct usb_driver hwarc_driver = {
 	.name =		"hwa-rc",
+	.id_table =	hwarc_id_table,
 	.probe =	hwarc_probe,
 	.disconnect =	hwarc_disconnect,
-	.id_table =	hwarc_id_table,
+	.pre_reset =    hwarc_pre_reset,
+	.post_reset =   hwarc_post_reset,
 };
 
 static int __init hwarc_driver_init(void)
diff --git a/drivers/uwb/reset.c b/drivers/uwb/reset.c
index 8de856fa795..e39b32099af 100644
--- a/drivers/uwb/reset.c
+++ b/drivers/uwb/reset.c
@@ -323,17 +323,16 @@ int uwbd_msg_handle_reset(struct uwb_event *evt)
 	struct uwb_rc *rc = evt->rc;
 	int ret;
 
-	/* Need to prevent the RC hardware module going away while in
-	   the rc->reset() call. */
-	if (!try_module_get(rc->owner))
-		return 0;
-
 	dev_info(&rc->uwb_dev.dev, "resetting radio controller\n");
 	ret = rc->reset(rc);
-	if (ret)
+	if (ret) {
 		dev_err(&rc->uwb_dev.dev, "failed to reset hardware: %d\n", ret);
-
-	module_put(rc->owner);
+		goto error;
+	}
+	return 0;
+error:
+	/* Nothing can be done except try the reset again. */
+	uwb_rc_reset_all(rc);
 	return ret;
 }
 
@@ -360,3 +359,37 @@ void uwb_rc_reset_all(struct uwb_rc *rc)
 	uwbd_event_queue(evt);
 }
 EXPORT_SYMBOL_GPL(uwb_rc_reset_all);
+
+void uwb_rc_pre_reset(struct uwb_rc *rc)
+{
+	rc->stop(rc);
+	uwbd_flush(rc);
+
+	mutex_lock(&rc->uwb_dev.mutex);
+	rc->beaconing = -1;
+	rc->scanning = -1;
+	mutex_unlock(&rc->uwb_dev.mutex);
+
+	uwb_rsv_remove_all(rc);
+}
+EXPORT_SYMBOL_GPL(uwb_rc_pre_reset);
+
+void uwb_rc_post_reset(struct uwb_rc *rc)
+{
+	int ret;
+
+	ret = rc->start(rc);
+	if (ret)
+		goto error;
+	ret = uwb_rc_mac_addr_set(rc, &rc->uwb_dev.mac_addr);
+	if (ret)
+		goto error;
+	ret = uwb_rc_dev_addr_set(rc, &rc->uwb_dev.dev_addr);
+	if (ret)
+		goto error;
+	return;
+error:
+	/* Nothing can be done except try the reset again. */
+	uwb_rc_reset_all(rc);
+}
+EXPORT_SYMBOL_GPL(uwb_rc_post_reset);
diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index ce0094657d3..3d76efe26ac 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -659,6 +659,25 @@ static void uwb_rsv_timer(unsigned long arg)
 	uwb_rsv_sched_update(rsv->rc);
 }
 
+/**
+ * uwb_rsv_remove_all - remove all reservations
+ * @rc: the radio controller
+ *
+ * A DRP IE update is not done.
+ */
+void uwb_rsv_remove_all(struct uwb_rc *rc)
+{
+	struct uwb_rsv *rsv, *t;
+
+	mutex_lock(&rc->rsvs_mutex);
+	list_for_each_entry_safe(rsv, t, &rc->reservations, rc_node) {
+		uwb_rsv_remove(rsv);
+	}
+	mutex_unlock(&rc->rsvs_mutex);
+
+	cancel_work_sync(&rc->rsv_update_work);
+}
+
 void uwb_rsv_init(struct uwb_rc *rc)
 {
 	INIT_LIST_HEAD(&rc->reservations);
@@ -682,14 +701,6 @@ int uwb_rsv_setup(struct uwb_rc *rc)
 
 void uwb_rsv_cleanup(struct uwb_rc *rc)
 {
-	struct uwb_rsv *rsv, *t;
-
-	mutex_lock(&rc->rsvs_mutex);
-	list_for_each_entry_safe(rsv, t, &rc->reservations, rc_node) {
-		uwb_rsv_remove(rsv);
-	}
-	mutex_unlock(&rc->rsvs_mutex);
-
-	cancel_work_sync(&rc->rsv_update_work);
+	uwb_rsv_remove_all(rc);
 	destroy_workqueue(rc->rsv_workq);
 }
diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c
index 2d8d62d9f53..5ad36164c13 100644
--- a/drivers/uwb/umc-bus.c
+++ b/drivers/uwb/umc-bus.c
@@ -11,23 +11,48 @@
 #include <linux/uwb/umc.h>
 #include <linux/pci.h>
 
-static int umc_bus_unbind_helper(struct device *dev, void *data)
+static int umc_bus_pre_reset_helper(struct device *dev, void *data)
 {
-	struct device *parent = data;
+	int ret = 0;
 
-	if (dev->parent == parent && dev->driver)
-		device_release_driver(dev);
-	return 0;
+	if (dev->driver) {
+		struct umc_dev *umc = to_umc_dev(dev);
+		struct umc_driver *umc_drv = to_umc_driver(dev->driver);
+
+		if (umc_drv->pre_reset)
+			ret = umc_drv->pre_reset(umc);
+		else
+			device_release_driver(dev);
+	}
+	return ret;
+}
+
+static int umc_bus_post_reset_helper(struct device *dev, void *data)
+{
+	int ret = 0;
+
+	if (dev->driver) {
+		struct umc_dev *umc = to_umc_dev(dev);
+		struct umc_driver *umc_drv = to_umc_driver(dev->driver);
+
+		if (umc_drv->post_reset)
+			ret = umc_drv->post_reset(umc);
+	} else
+		ret = device_attach(dev);
+
+	return ret;
 }
 
 /**
  * umc_controller_reset - reset the whole UMC controller
  * @umc: the UMC device for the radio controller.
  *
- * Drivers will be unbound from all UMC devices belonging to the
- * controller and then the radio controller will be rebound.  The
- * radio controller is expected to do a full hardware reset when it is
- * probed.
+ * Drivers or all capabilities of the controller will have their
+ * pre_reset methods called or be unbound from their device.  Then all
+ * post_reset methods will be called or the drivers will be rebound.
+ *
+ * Radio controllers must provide pre_reset and post_reset methods and
+ * reset the hardware in their start method.
  *
  * If this is called while a probe() or remove() is in progress it
  * will return -EAGAIN and not perform the reset.
@@ -35,14 +60,13 @@ static int umc_bus_unbind_helper(struct device *dev, void *data)
 int umc_controller_reset(struct umc_dev *umc)
 {
 	struct device *parent = umc->dev.parent;
-	int ret;
+	int ret = 0;
 
-	if (down_trylock(&parent->sem))
+	if(down_trylock(&parent->sem))
 		return -EAGAIN;
-	bus_for_each_dev(&umc_bus_type, NULL, parent, umc_bus_unbind_helper);
-	ret = device_attach(&umc->dev);
-	if (ret == 1)
-		ret = 0;
+	ret = device_for_each_child(parent, parent, umc_bus_pre_reset_helper);
+	if (ret >= 0)
+		device_for_each_child(parent, parent, umc_bus_post_reset_helper);
 	up(&parent->sem);
 
 	return ret;
@@ -75,10 +99,10 @@ static int umc_bus_rescan_helper(struct device *dev, void *data)
 	if (!dev->driver)
 		ret = device_attach(dev);
 
-	return ret < 0 ? ret : 0;
+	return ret;
 }
 
-static void umc_bus_rescan(void)
+static void umc_bus_rescan(struct device *parent)
 {
 	int err;
 
@@ -86,7 +110,7 @@ static void umc_bus_rescan(void)
 	 * We can't use bus_rescan_devices() here as it deadlocks when
 	 * it tries to retake the dev->parent semaphore.
 	 */
-	err = bus_for_each_dev(&umc_bus_type, NULL, NULL, umc_bus_rescan_helper);
+	err = device_for_each_child(parent, NULL, umc_bus_rescan_helper);
 	if (err < 0)
 		printk(KERN_WARNING "%s: rescan of bus failed: %d\n",
 		       KBUILD_MODNAME, err);
@@ -120,7 +144,7 @@ static int umc_device_probe(struct device *dev)
 	if (err)
 		put_device(dev);
 	else
-		umc_bus_rescan();
+		umc_bus_rescan(dev->parent);
 
 	return err;
 }
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 4c244967997..af95541dabc 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -248,6 +248,7 @@ extern struct device_attribute dev_attr_scan;
 void uwb_rsv_init(struct uwb_rc *rc);
 int uwb_rsv_setup(struct uwb_rc *rc);
 void uwb_rsv_cleanup(struct uwb_rc *rc);
+void uwb_rsv_remove_all(struct uwb_rc *rc);
 
 void uwb_rsv_set_state(struct uwb_rsv *rsv, enum uwb_rsv_state new_state);
 void uwb_rsv_remove(struct uwb_rsv *rsv);
diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c
index 6c454eadd30..e0d66938ccd 100644
--- a/drivers/uwb/whc-rc.c
+++ b/drivers/uwb/whc-rc.c
@@ -394,7 +394,7 @@ void whcrc_stop_rc(struct uwb_rc *rc)
 
 	le_writel(0, whcrc->rc_base + URCCMD);
 	whci_wait_for(&umc_dev->dev, whcrc->rc_base + URCSTS,
-		      URCSTS_HALTED, 0, 40, "URCSTS.HALTED");
+		      URCSTS_HALTED, URCSTS_HALTED, 100, "URCSTS.HALTED");
 }
 
 static void whcrc_init(struct whcrc *whcrc)
@@ -488,6 +488,24 @@ static void whcrc_remove(struct umc_dev *umc_dev)
 	d_printf(1, &umc_dev->dev, "freed whcrc %p\n", whcrc);
 }
 
+static int whcrc_pre_reset(struct umc_dev *umc)
+{
+	struct whcrc *whcrc = umc_get_drvdata(umc);
+	struct uwb_rc *uwb_rc = whcrc->uwb_rc;
+
+	uwb_rc_pre_reset(uwb_rc);
+	return 0;
+}
+
+static int whcrc_post_reset(struct umc_dev *umc)
+{
+	struct whcrc *whcrc = umc_get_drvdata(umc);
+	struct uwb_rc *uwb_rc = whcrc->uwb_rc;
+
+	uwb_rc_post_reset(uwb_rc);
+	return 0;
+}
+
 /* PCI device ID's that we handle [so it gets loaded] */
 static struct pci_device_id whcrc_id_table[] = {
 	{ PCI_DEVICE_CLASS(PCI_CLASS_WIRELESS_WHCI, ~0) },
@@ -496,10 +514,12 @@ static struct pci_device_id whcrc_id_table[] = {
 MODULE_DEVICE_TABLE(pci, whcrc_id_table);
 
 static struct umc_driver whcrc_driver = {
-	.name   = "whc-rc",
-	.cap_id = UMC_CAP_ID_WHCI_RC,
-	.probe  = whcrc_probe,
-	.remove = whcrc_remove,
+	.name       = "whc-rc",
+	.cap_id     = UMC_CAP_ID_WHCI_RC,
+	.probe      = whcrc_probe,
+	.remove     = whcrc_remove,
+	.pre_reset  = whcrc_pre_reset,
+	.post_reset = whcrc_post_reset,
 };
 
 static int __init whcrc_driver_init(void)
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index 881f0c5b6d2..c4854848999 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -540,6 +540,8 @@ void uwb_rc_rm(struct uwb_rc *);
 void uwb_rc_neh_grok(struct uwb_rc *, void *, size_t);
 void uwb_rc_neh_error(struct uwb_rc *, int);
 void uwb_rc_reset_all(struct uwb_rc *rc);
+void uwb_rc_pre_reset(struct uwb_rc *rc);
+void uwb_rc_post_reset(struct uwb_rc *rc);
 
 /**
  * uwb_rsv_is_owner - is the owner of this reservation the RC?
diff --git a/include/linux/uwb/umc.h b/include/linux/uwb/umc.h
index 36a39e34f8d..4b4fc0f4385 100644
--- a/include/linux/uwb/umc.h
+++ b/include/linux/uwb/umc.h
@@ -89,6 +89,8 @@ struct umc_driver {
 	void (*remove)(struct umc_dev *);
 	int  (*suspend)(struct umc_dev *, pm_message_t state);
 	int  (*resume)(struct umc_dev *);
+	int  (*pre_reset)(struct umc_dev *);
+	int  (*post_reset)(struct umc_dev *);
 
 	struct device_driver driver;
 };
-- 
cgit v1.2.3-70-g09d2


From 3d8160b1493bcadca74fbb635d79b3928b8999cf Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 7 Nov 2008 22:52:14 -0800
Subject: Revert "net: Guaranetee the proper ordering of the loopback device."

This reverts commit ae33bc40c0d96d02f51a996482ea7e41c5152695.
---
 drivers/net/loopback.c    | 13 +++++++++++--
 include/linux/netdevice.h |  1 -
 net/core/dev.c            | 12 ------------
 3 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c4516b580ba..91d08585a6d 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -204,8 +204,17 @@ static __net_exit void loopback_net_exit(struct net *net)
 	unregister_netdev(dev);
 }
 
-/* Registered in net/core/dev.c */
-struct pernet_operations __net_initdata loopback_net_ops = {
+static struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
        .exit = loopback_net_exit,
 };
+
+static int __init loopback_init(void)
+{
+	return register_pernet_device(&loopback_net_ops);
+}
+
+/* Loopback is special. It should be initialized before any other network
+ * device and network subsystem.
+ */
+fs_initcall(loopback_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 12d7f4469dc..f1b0dbe5846 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1766,7 +1766,6 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	return 0;
 }
 
-extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index e0dc67a789b..2306d56fbb5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4909,18 +4909,6 @@ static int __init net_dev_init(void)
 	if (register_pernet_subsys(&netdev_net_ops))
 		goto out;
 
-	/* The loopback device is special if any other network devices
-	 * is present in a network namespace the loopback device must
-	 * be present. Since we now dynamically allocate and free the
-	 * loopback device ensure this invariant is maintained by
-	 * keeping the loopback device as the first device on the
-	 * list of network devices.  Ensuring the loopback devices
-	 * is the first device that appears and the last network device
-	 * that disappears.
-	 */
-	if (register_pernet_device(&loopback_net_ops))
-		goto out;
-
 	if (register_pernet_device(&default_device_ops))
 		goto out;
 
-- 
cgit v1.2.3-70-g09d2


From 505d4f73dda9e20d59da05008f1f5eb432613e71 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@maxwell.aristanetworks.com>
Date: Fri, 7 Nov 2008 22:54:20 -0800
Subject: net: Guaranetee the proper ordering of the loopback device. v2

I was recently hunting a bug that occurred in network namespace
cleanup.  In looking at the code it became apparrent that we have
and will continue to have cases where if we have anything going
on in a network namespace there will be assumptions that the
loopback device is present.   Things like sending igmp unsubscribe
messages when we bring down network devices invokes the routing
code which assumes that at least the loopback driver is present.

Therefore to avoid magic initcall ordering hackery that is hard
to follow and hard to get right insert a call to register the
loopback device directly from net_dev_init().    This guarantes
that the loopback device is the first device registered and
the last network device to go away.

But do it carefully so we register the loopback device after
we clear dev_boot_phase.

Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 13 ++-----------
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 22 +++++++++++++++++-----
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 91d08585a6d..c4516b580ba 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -204,17 +204,8 @@ static __net_exit void loopback_net_exit(struct net *net)
 	unregister_netdev(dev);
 }
 
-static struct pernet_operations __net_initdata loopback_net_ops = {
+/* Registered in net/core/dev.c */
+struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
        .exit = loopback_net_exit,
 };
-
-static int __init loopback_init(void)
-{
-	return register_pernet_device(&loopback_net_ops);
-}
-
-/* Loopback is special. It should be initialized before any other network
- * device and network subsystem.
- */
-fs_initcall(loopback_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f1b0dbe5846..12d7f4469dc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb)
 	return 0;
 }
 
+extern struct pernet_operations __net_initdata loopback_net_ops;
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 2306d56fbb5..31568b2068a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4909,9 +4909,6 @@ static int __init net_dev_init(void)
 	if (register_pernet_subsys(&netdev_net_ops))
 		goto out;
 
-	if (register_pernet_device(&default_device_ops))
-		goto out;
-
 	/*
 	 *	Initialise the packet receive queues.
 	 */
@@ -4928,10 +4925,25 @@ static int __init net_dev_init(void)
 		queue->backlog.weight = weight_p;
 	}
 
-	netdev_dma_register();
-
 	dev_boot_phase = 0;
 
+	/* The loopback device is special if any other network devices
+	 * is present in a network namespace the loopback device must
+	 * be present. Since we now dynamically allocate and free the
+	 * loopback device ensure this invariant is maintained by
+	 * keeping the loopback device as the first device on the
+	 * list of network devices.  Ensuring the loopback devices
+	 * is the first device that appears and the last network device
+	 * that disappears.
+	 */
+	if (register_pernet_device(&loopback_net_ops))
+		goto out;
+
+	if (register_pernet_device(&default_device_ops))
+		goto out;
+
+	netdev_dma_register();
+
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
-- 
cgit v1.2.3-70-g09d2


From f400923735ecbb67cbe4a3606c9479f694754f51 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 7 Nov 2008 22:56:00 -0800
Subject: pkt_sched: Control group classifier

The classifier should cover the most common use case and will work
without any special configuration.

The principle of the classifier is to directly access the
task_struct via get_current(). In order for this to work,
classification requests from softirqs must be ignored. This is
not a problem because the vast majority of packets in softirq
context are not assigned to a task anyway. For this to work, a
mechanism is needed to trace softirq context.

This repost goes back to the method of relying on the number of
nested bh disable calls for the sake of not adding too much
complexity and the option to come up with something more reliable
if actually needed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/cgroup_subsys.h |   6 +
 include/linux/pkt_cls.h       |  14 ++
 net/sched/Kconfig             |  11 ++
 net/sched/Makefile            |   1 +
 net/sched/cls_cgroup.c        | 290 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 322 insertions(+)
 create mode 100644 net/sched/cls_cgroup.c

(limited to 'include/linux')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c22396e8b5..9c8d31bacf4 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -54,3 +54,9 @@ SUBSYS(freezer)
 #endif
 
 /* */
+
+#ifdef CONFIG_NET_CLS_CGROUP
+SUBSYS(net_cls)
+#endif
+
+/* */
diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 7cf7824df77..e6aa8482ad7 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -394,6 +394,20 @@ enum
 
 #define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1)
 
+
+/* Cgroup classifier */
+
+enum
+{
+	TCA_CGROUP_UNSPEC,
+	TCA_CGROUP_ACT,
+	TCA_CGROUP_POLICE,
+	TCA_CGROUP_EMATCHES,
+	__TCA_CGROUP_MAX,
+};
+
+#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1)
+
 /* Extended Matches */
 
 struct tcf_ematch_tree_hdr
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 6767e54155d..36543b6fcef 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -316,6 +316,17 @@ config NET_CLS_FLOW
 	  To compile this code as a module, choose M here: the
 	  module will be called cls_flow.
 
+config NET_CLS_CGROUP
+	bool "Control Group Classifier"
+	select NET_CLS
+	depends on CGROUPS
+	---help---
+	  Say Y here if you want to classify packets based on the control
+	  cgroup of their process.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called cls_cgroup.
+
 config NET_EMATCH
 	bool "Extended Matches"
 	select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index e60c9925b26..70b35f8708c 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_NET_CLS_TCINDEX)	+= cls_tcindex.o
 obj-$(CONFIG_NET_CLS_RSVP6)	+= cls_rsvp6.o
 obj-$(CONFIG_NET_CLS_BASIC)	+= cls_basic.o
 obj-$(CONFIG_NET_CLS_FLOW)	+= cls_flow.o
+obj-$(CONFIG_NET_CLS_CGROUP)	+= cls_cgroup.o
 obj-$(CONFIG_NET_EMATCH)	+= ematch.o
 obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
new file mode 100644
index 00000000000..53ada2c0e41
--- /dev/null
+++ b/net/sched/cls_cgroup.c
@@ -0,0 +1,290 @@
+/*
+ * net/sched/cls_cgroup.c	Control Group Classifier
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+
+struct cgroup_cls_state
+{
+	struct cgroup_subsys_state css;
+	u32 classid;
+};
+
+static inline struct cgroup_cls_state *net_cls_state(struct cgroup *cgrp)
+{
+	return (struct cgroup_cls_state *)
+		cgroup_subsys_state(cgrp, net_cls_subsys_id);
+}
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+						 struct cgroup *cgrp)
+{
+	struct cgroup_cls_state *cs;
+
+	if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	if (cgrp->parent)
+		cs->classid = net_cls_state(cgrp->parent)->classid;
+
+	return &cs->css;
+}
+
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	kfree(ss);
+}
+
+static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
+{
+	return net_cls_state(cgrp)->classid;
+}
+
+static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
+{
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
+	net_cls_state(cgrp)->classid = (u32) value;
+
+	cgroup_unlock();
+
+	return 0;
+}
+
+static struct cftype ss_files[] = {
+	{
+		.name = "classid",
+		.read_u64 = read_classid,
+		.write_u64 = write_classid,
+	},
+};
+
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
+}
+
+struct cgroup_subsys net_cls_subsys = {
+	.name		= "net_cls",
+	.create		= cgrp_create,
+	.destroy	= cgrp_destroy,
+	.populate	= cgrp_populate,
+	.subsys_id	= net_cls_subsys_id,
+};
+
+struct cls_cgroup_head
+{
+	u32			handle;
+	struct tcf_exts		exts;
+	struct tcf_ematch_tree	ematches;
+};
+
+static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
+			       struct tcf_result *res)
+{
+	struct cls_cgroup_head *head = tp->root;
+	struct cgroup_cls_state *cs;
+	int ret = 0;
+
+	/*
+	 * Due to the nature of the classifier it is required to ignore all
+	 * packets originating from softirq context as accessing `current'
+	 * would lead to false results.
+	 *
+	 * This test assumes that all callers of dev_queue_xmit() explicitely
+	 * disable bh. Knowing this, it is possible to detect softirq based
+	 * calls by looking at the number of nested bh disable calls because
+	 * softirqs always disables bh.
+	 */
+	if (softirq_count() != SOFTIRQ_OFFSET)
+		return -1;
+
+	rcu_read_lock();
+	cs = (struct cgroup_cls_state *) task_subsys_state(current,
+							   net_cls_subsys_id);
+	if (cs->classid && tcf_em_tree_match(skb, &head->ematches, NULL)) {
+		res->classid = cs->classid;
+		res->class = 0;
+		ret = tcf_exts_exec(skb, &head->exts, res);
+	} else
+		ret = -1;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
+{
+	return 0UL;
+}
+
+static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int cls_cgroup_init(struct tcf_proto *tp)
+{
+	return 0;
+}
+
+static const struct tcf_ext_map cgroup_ext_map = {
+	.action = TCA_CGROUP_ACT,
+	.police = TCA_CGROUP_POLICE,
+};
+
+static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
+	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },
+};
+
+static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
+			     u32 handle, struct nlattr **tca,
+			     unsigned long *arg)
+{
+	struct nlattr *tb[TCA_CGROUP_MAX+1];
+	struct cls_cgroup_head *head = tp->root;
+	struct tcf_ematch_tree t;
+	struct tcf_exts e;
+	int err;
+
+	if (head == NULL) {
+		if (!handle)
+			return -EINVAL;
+
+		head = kzalloc(sizeof(*head), GFP_KERNEL);
+		if (head == NULL)
+			return -ENOBUFS;
+
+		head->handle = handle;
+
+		tcf_tree_lock(tp);
+		tp->root = head;
+		tcf_tree_unlock(tp);
+	}
+
+	if (handle != head->handle)
+		return -ENOENT;
+
+	err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
+			       cgroup_policy);
+	if (err < 0)
+		return err;
+
+	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map);
+	if (err < 0)
+		return err;
+
+	err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
+	if (err < 0)
+		return err;
+
+	tcf_exts_change(tp, &head->exts, &e);
+	tcf_em_tree_change(tp, &head->ematches, &t);
+
+	return 0;
+}
+
+static void cls_cgroup_destroy(struct tcf_proto *tp)
+{
+	struct cls_cgroup_head *head;
+
+	head = (struct cls_cgroup_head *)xchg(&tp->root, NULL);
+
+	if (head) {
+		tcf_exts_destroy(tp, &head->exts);
+		tcf_em_tree_destroy(tp, &head->ematches);
+		kfree(head);
+	}
+}
+
+static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
+
+static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct cls_cgroup_head *head = tp->root;
+
+	if (arg->count < arg->skip)
+		goto skip;
+
+	if (arg->fn(tp, (unsigned long) head, arg) < 0) {
+		arg->stop = 1;
+		return;
+	}
+skip:
+	arg->count++;
+}
+
+static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
+			   struct sk_buff *skb, struct tcmsg *t)
+{
+	struct cls_cgroup_head *head = tp->root;
+	unsigned char *b = skb_tail_pointer(skb);
+	struct nlattr *nest;
+
+	t->tcm_handle = head->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+
+	if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0 ||
+	    tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+
+	if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0)
+		goto nla_put_failure;
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
+	.kind		=	"cgroup",
+	.init		=	cls_cgroup_init,
+	.change		=	cls_cgroup_change,
+	.classify	=	cls_cgroup_classify,
+	.destroy	=	cls_cgroup_destroy,
+	.get		=	cls_cgroup_get,
+	.put		=	cls_cgroup_put,
+	.delete		=	cls_cgroup_delete,
+	.walk		=	cls_cgroup_walk,
+	.dump		=	cls_cgroup_dump,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init init_cgroup_cls(void)
+{
+	return register_tcf_proto_ops(&cls_cgroup_ops);
+}
+
+static void __exit exit_cgroup_cls(void)
+{
+	unregister_tcf_proto_ops(&cls_cgroup_ops);
+}
+
+module_init(init_cgroup_cls);
+module_exit(exit_cgroup_cls);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 1239cd58d237fa6ad501acaec8776262a5784ec8 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 28 Oct 2008 11:12:57 +0100
Subject: wireless: move mesh config length constant

This is a constant from the 802.11 specification.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Javier Cardona <javier@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h | 3 +++
 net/mac80211/mesh.c       | 2 +-
 net/mac80211/mesh.h       | 5 +----
 net/mac80211/scan.c       | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index aad99195a4c..9dc288b920c 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -97,7 +97,10 @@
 #define IEEE80211_MAX_FRAME_LEN		2352
 
 #define IEEE80211_MAX_SSID_LEN		32
+
 #define IEEE80211_MAX_MESH_ID_LEN	32
+#define IEEE80211_MESH_CONFIG_LEN	19
+
 #define IEEE80211_QOS_CTL_LEN		2
 #define IEEE80211_QOS_CTL_TID_MASK	0x000F
 #define IEEE80211_QOS_CTL_TAG1D_MASK	0x0007
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index d3b6e1a648b..82f568e9436 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -238,7 +238,7 @@ void mesh_mgmt_ies_add(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata)
 
 	pos = skb_put(skb, 21);
 	*pos++ = WLAN_EID_MESH_CONFIG;
-	*pos++ = MESH_CFG_LEN;
+	*pos++ = IEEE80211_MESH_CONFIG_LEN;
 	/* Version */
 	*pos++ = 1;
 
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index e10471c6ba4..c197ab545e5 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -145,9 +145,6 @@ struct mesh_rmc {
 };
 
 
-/* Mesh IEs constants */
-#define MESH_CFG_LEN		19
-
 /*
  * MESH_CFG_COMP_LEN Includes:
  * 	- Active path selection protocol ID.
@@ -157,7 +154,7 @@ struct mesh_rmc {
  * Does not include mesh capabilities, which may vary across nodes in the same
  * mesh
  */
-#define MESH_CFG_CMP_LEN 	17
+#define MESH_CFG_CMP_LEN 	(IEEE80211_MESH_CONFIG_LEN - 2)
 
 /* Default values, timeouts in ms */
 #define MESH_TTL 		5
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 7372d7abb8c..f5c7c337192 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -159,7 +159,7 @@ ieee80211_rx_mesh_bss_add(struct ieee80211_local *local, u8 *mesh_id, int mesh_i
 {
 	struct ieee80211_bss *bss;
 
-	if (mesh_config_len != MESH_CFG_LEN)
+	if (mesh_config_len != IEEE80211_MESH_CONFIG_LEN)
 		return NULL;
 
 	bss = kzalloc(sizeof(*bss), GFP_ATOMIC);
-- 
cgit v1.2.3-70-g09d2


From 90c97a040d6b08cc4890328aa262fdc37336ab01 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Thu, 30 Oct 2008 16:59:22 +0200
Subject: nl80211: Add basic rate configuration for AP mode

Add a new attribute, NL80211_ATTR_BSS_BASIC_RATES, that can be used with
NL80211_CMD_SET_BSS for userspace (e.g., hostapd) to set which rates are
in the basic rate set.

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h |  6 ++++++
 include/net/cfg80211.h  |  5 +++++
 net/mac80211/cfg.c      | 18 ++++++++++++++++++
 net/wireless/nl80211.c  |  8 ++++++++
 4 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index e4cc7869b22..5009809588c 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -243,6 +243,9 @@ enum nl80211_commands {
  *	(u8, 0 or 1)
  * @NL80211_ATTR_BSS_SHORT_SLOT_TIME: whether short slot time enabled
  *	(u8, 0 or 1)
+ * @NL80211_ATTR_BSS_BASIC_RATES: basic rates, array of basic
+ *	rates in format defined by IEEE 802.11 7.3.2.2 but without the length
+ *	restriction (at most %NL80211_MAX_SUPP_RATES).
  *
  * @NL80211_ATTR_HT_CAPABILITY: HT Capability information element (from
  *	association request when used with NL80211_CMD_NEW_STATION)
@@ -307,6 +310,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_MESH_PARAMS,
 
+	NL80211_ATTR_BSS_BASIC_RATES,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -318,6 +323,7 @@ enum nl80211_attrs {
  * here
  */
 #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY
+#define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES
 
 #define NL80211_MAX_SUPP_RATES			32
 #define NL80211_MAX_SUPP_REG_RULES		32
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 03e1e88c6a0..7caf3c76a12 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -280,11 +280,16 @@ struct mpath_info {
  *	(0 = no, 1 = yes, -1 = do not change)
  * @use_short_slot_time: Whether the use of short slot time is allowed
  *	(0 = no, 1 = yes, -1 = do not change)
+ * @basic_rates: basic rates in IEEE 802.11 format
+ *	(or NULL for no change)
+ * @basic_rates_len: number of basic rates
  */
 struct bss_parameters {
 	int use_cts_prot;
 	int use_short_preamble;
 	int use_short_slot_time;
+	u8 *basic_rates;
+	u8 basic_rates_len;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 91f56a48e2b..442a4d7b180 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1046,6 +1046,24 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 		changed |= BSS_CHANGED_ERP_SLOT;
 	}
 
+	if (params->basic_rates) {
+		int i, j;
+		u32 rates = 0;
+		struct ieee80211_local *local = wiphy_priv(wiphy);
+		struct ieee80211_supported_band *sband =
+			wiphy->bands[local->oper_channel->band];
+
+		for (i = 0; i < params->basic_rates_len; i++) {
+			int rate = (params->basic_rates[i] & 0x7f) * 5;
+			for (j = 0; j < sband->n_bitrates; j++) {
+				if (sband->bitrates[j].bitrate == rate)
+					rates |= BIT(j);
+			}
+		}
+		sdata->vif.bss_conf.basic_rates = rates;
+		changed |= BSS_CHANGED_BASIC_RATES;
+	}
+
 	ieee80211_bss_info_change_notify(sdata, changed);
 
 	return 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 5e1d658a8b5..1ea5e3fd393 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -95,6 +95,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 },
 	[NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 },
+	[NL80211_ATTR_BSS_BASIC_RATES] = { .type = NLA_BINARY,
+					   .len = NL80211_MAX_SUPP_RATES },
 
 	[NL80211_ATTR_MESH_PARAMS] = { .type = NLA_NESTED },
 
@@ -1613,6 +1615,12 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME])
 		params.use_short_slot_time =
 		    nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]);
+	if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+		params.basic_rates =
+			nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+		params.basic_rates_len =
+			nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+	}
 
 	err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
 	if (err)
-- 
cgit v1.2.3-70-g09d2


From 318884875bdddca663ecc373c813cf8e117d9e43 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Thu, 30 Oct 2008 16:59:24 +0200
Subject: nl80211: Add TX queue parameter configuration

Add a new attribute, NL80211_ATTR_WIPHY_TXQ_PARAMS, that can be used with
NL80211_CMD_SET_WIPHY for userspace (e.g., hostapd) to set TX queue
parameters (txop, cwmin, cwmax, aifs).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 43 +++++++++++++++++++++++++++++--
 include/net/cfg80211.h  | 23 +++++++++++++++++
 net/mac80211/cfg.c      | 25 ++++++++++++++++++
 net/wireless/nl80211.c  | 67 +++++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 151 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 5009809588c..79827345351 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -25,8 +25,9 @@
  *
  * @NL80211_CMD_GET_WIPHY: request information about a wiphy or dump request
  *	to get a list of all present wiphys.
- * @NL80211_CMD_SET_WIPHY: set wiphy name, needs %NL80211_ATTR_WIPHY and
- *	%NL80211_ATTR_WIPHY_NAME.
+ * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or
+ *	%NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME
+ *	and/or %NL80211_ATTR_WIPHY_TXQ_PARAMS.
  * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request
  *	or rename notification. Has attributes %NL80211_ATTR_WIPHY and
  *	%NL80211_ATTR_WIPHY_NAME.
@@ -178,6 +179,7 @@ enum nl80211_commands {
  * @NL80211_ATTR_WIPHY: index of wiphy to operate on, cf.
  *	/sys/class/ieee80211/<phyname>/index
  * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming)
+ * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters
  *
  * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on
  * @NL80211_ATTR_IFNAME: network interface name
@@ -312,6 +314,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_BSS_BASIC_RATES,
 
+	NL80211_ATTR_WIPHY_TXQ_PARAMS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -324,6 +328,7 @@ enum nl80211_attrs {
  */
 #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY
 #define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES
+#define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS
 
 #define NL80211_MAX_SUPP_RATES			32
 #define NL80211_MAX_SUPP_REG_RULES		32
@@ -698,4 +703,38 @@ enum nl80211_meshconf_params {
 	NL80211_MESHCONF_ATTR_MAX = __NL80211_MESHCONF_ATTR_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_txq_attr - TX queue parameter attributes
+ * @__NL80211_TXQ_ATTR_INVALID: Attribute number 0 is reserved
+ * @NL80211_TXQ_ATTR_QUEUE: TX queue identifier (NL80211_TXQ_Q_*)
+ * @NL80211_TXQ_ATTR_TXOP: Maximum burst time in units of 32 usecs, 0 meaning
+ *	disabled
+ * @NL80211_TXQ_ATTR_CWMIN: Minimum contention window [a value of the form
+ *	2^n-1 in the range 1..32767]
+ * @NL80211_TXQ_ATTR_CWMAX: Maximum contention window [a value of the form
+ *	2^n-1 in the range 1..32767]
+ * @NL80211_TXQ_ATTR_AIFS: Arbitration interframe space [0..255]
+ * @__NL80211_TXQ_ATTR_AFTER_LAST: Internal
+ * @NL80211_TXQ_ATTR_MAX: Maximum TXQ attribute number
+ */
+enum nl80211_txq_attr {
+	__NL80211_TXQ_ATTR_INVALID,
+	NL80211_TXQ_ATTR_QUEUE,
+	NL80211_TXQ_ATTR_TXOP,
+	NL80211_TXQ_ATTR_CWMIN,
+	NL80211_TXQ_ATTR_CWMAX,
+	NL80211_TXQ_ATTR_AIFS,
+
+	/* keep last */
+	__NL80211_TXQ_ATTR_AFTER_LAST,
+	NL80211_TXQ_ATTR_MAX = __NL80211_TXQ_ATTR_AFTER_LAST - 1
+};
+
+enum nl80211_txq_q {
+	NL80211_TXQ_Q_VO,
+	NL80211_TXQ_Q_VI,
+	NL80211_TXQ_Q_BE,
+	NL80211_TXQ_Q_BK
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7caf3c76a12..448023193e5 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -371,6 +371,24 @@ struct mesh_config {
 	u16 dot11MeshHWMPnetDiameterTraversalTime;
 };
 
+/**
+ * struct ieee80211_txq_params - TX queue parameters
+ * @queue: TX queue identifier (NL80211_TXQ_Q_*)
+ * @txop: Maximum burst time in units of 32 usecs, 0 meaning disabled
+ * @cwmin: Minimum contention window [a value of the form 2^n-1 in the range
+ *	1..32767]
+ * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range
+ *	1..32767]
+ * @aifs: Arbitration interframe space [0..255]
+ */
+struct ieee80211_txq_params {
+	enum nl80211_txq_q queue;
+	u16 txop;
+	u16 cwmin;
+	u16 cwmax;
+	u8 aifs;
+};
+
 /* from net/wireless.h */
 struct wiphy;
 
@@ -430,6 +448,8 @@ struct wiphy;
  * @set_mesh_cfg: set mesh parameters (by now, just mesh id)
  *
  * @change_bss: Modify parameters for a given BSS.
+ *
+ * @set_txq_params: Set TX queue parameters
  */
 struct cfg80211_ops {
 	int	(*add_virtual_intf)(struct wiphy *wiphy, char *name,
@@ -490,6 +510,9 @@ struct cfg80211_ops {
 				const struct mesh_config *nconf, u32 mask);
 	int	(*change_bss)(struct wiphy *wiphy, struct net_device *dev,
 			      struct bss_parameters *params);
+
+	int	(*set_txq_params)(struct wiphy *wiphy,
+				  struct ieee80211_txq_params *params);
 };
 
 #endif /* __NET_CFG80211_H */
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 442a4d7b180..fe672faa819 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1069,6 +1069,30 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
 	return 0;
 }
 
+static int ieee80211_set_txq_params(struct wiphy *wiphy,
+				    struct ieee80211_txq_params *params)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+	struct ieee80211_tx_queue_params p;
+
+	if (!local->ops->conf_tx)
+		return -EOPNOTSUPP;
+
+	memset(&p, 0, sizeof(p));
+	p.aifs = params->aifs;
+	p.cw_max = params->cwmax;
+	p.cw_min = params->cwmin;
+	p.txop = params->txop;
+	if (local->ops->conf_tx(local_to_hw(local), params->queue, &p)) {
+		printk(KERN_DEBUG "%s: failed to set TX queue "
+		       "parameters for queue %d\n", local->mdev->name,
+		       params->queue);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -1095,4 +1119,5 @@ struct cfg80211_ops mac80211_config_ops = {
 	.get_mesh_params = ieee80211_get_mesh_params,
 #endif
 	.change_bss = ieee80211_change_bss,
+	.set_txq_params = ieee80211_set_txq_params,
 };
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1ea5e3fd393..e3e1494e769 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -58,6 +58,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
 	[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
 				      .len = BUS_ID_SIZE-1 },
+	[NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED },
 
 	[NL80211_ATTR_IFTYPE] = { .type = NLA_U32 },
 	[NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
@@ -286,20 +287,76 @@ static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info)
 	return -ENOBUFS;
 }
 
+static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
+	[NL80211_TXQ_ATTR_QUEUE]		= { .type = NLA_U8 },
+	[NL80211_TXQ_ATTR_TXOP]			= { .type = NLA_U16 },
+	[NL80211_TXQ_ATTR_CWMIN]		= { .type = NLA_U16 },
+	[NL80211_TXQ_ATTR_CWMAX]		= { .type = NLA_U16 },
+	[NL80211_TXQ_ATTR_AIFS]			= { .type = NLA_U8 },
+};
+
+static int parse_txq_params(struct nlattr *tb[],
+			    struct ieee80211_txq_params *txq_params)
+{
+	if (!tb[NL80211_TXQ_ATTR_QUEUE] || !tb[NL80211_TXQ_ATTR_TXOP] ||
+	    !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
+	    !tb[NL80211_TXQ_ATTR_AIFS])
+		return -EINVAL;
+
+	txq_params->queue = nla_get_u8(tb[NL80211_TXQ_ATTR_QUEUE]);
+	txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
+	txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
+	txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
+	txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
+
+	return 0;
+}
+
 static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev;
-	int result;
-
-	if (!info->attrs[NL80211_ATTR_WIPHY_NAME])
-		return -EINVAL;
+	int result = 0, rem_txq_params = 0;
+	struct nlattr *nl_txq_params;
 
 	rdev = cfg80211_get_dev_from_info(info);
 	if (IS_ERR(rdev))
 		return PTR_ERR(rdev);
 
-	result = cfg80211_dev_rename(rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME]));
+	if (info->attrs[NL80211_ATTR_WIPHY_NAME]) {
+		result = cfg80211_dev_rename(
+			rdev, nla_data(info->attrs[NL80211_ATTR_WIPHY_NAME]));
+		if (result)
+			goto bad_res;
+	}
+
+	if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
+		struct ieee80211_txq_params txq_params;
+		struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];
+
+		if (!rdev->ops->set_txq_params) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		nla_for_each_nested(nl_txq_params,
+				    info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
+				    rem_txq_params) {
+			nla_parse(tb, NL80211_TXQ_ATTR_MAX,
+				  nla_data(nl_txq_params),
+				  nla_len(nl_txq_params),
+				  txq_params_policy);
+			result = parse_txq_params(tb, &txq_params);
+			if (result)
+				goto bad_res;
+
+			result = rdev->ops->set_txq_params(&rdev->wiphy,
+							   &txq_params);
+			if (result)
+				goto bad_res;
+		}
+	}
 
+bad_res:
 	cfg80211_put_dev(rdev);
 	return result;
 }
-- 
cgit v1.2.3-70-g09d2


From fc6971d491517ba15e800540ff88caa55dc65b01 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Thu, 30 Oct 2008 19:59:05 +0200
Subject: mac80211_hwsim: Add support for client PS mode

This introduces a debugfs file (ieee80211/phy#/hwsim/ps) that can be
used to force a simulated radio into power save mode. Following values
can be written into this file to change PS mode:
0 = power save disabled (constantly awake)
1 = power save enabled (drop all frames; do not send PS-Poll)
2 = power save enabled (send PS-Poll frames automatically to receive
    buffered unicast frames); not yet fully implemented
3 = manual PS-Poll trigger (send a single PS-Poll frame)

Two different behavior for power save mode processing can be tested:
- move between modes 1 and 0 (i.e., receive all buffered frames at a
  time)
- move to mode 1 and use manual PS-Poll frames (write 3 to the 'ps'
  debugfs file) to fetch power save buffered frames one at a time

Mode 2 (automatic PS-Poll) does not yet parse Beacon frames, but
eventually, it should take a look at TIM IE and send PS-Poll if a
traffic bit is set for our AID.

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/mac80211_hwsim.c | 184 ++++++++++++++++++++++++++++++++++
 include/linux/ieee80211.h             |   7 ++
 2 files changed, 191 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index bc9da939376..b9230da925e 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -21,6 +21,7 @@
 #include <linux/if_arp.h>
 #include <linux/rtnetlink.h>
 #include <linux/etherdevice.h>
+#include <linux/debugfs.h>
 
 MODULE_AUTHOR("Jouni Malinen");
 MODULE_DESCRIPTION("Software simulator of 802.11 radio(s) for mac80211");
@@ -32,6 +33,9 @@ MODULE_PARM_DESC(radios, "Number of simulated radios");
 
 struct hwsim_vif_priv {
 	u32 magic;
+	u8 bssid[ETH_ALEN];
+	bool assoc;
+	u16 aid;
 };
 
 #define HWSIM_VIF_MAGIC	0x69537748
@@ -132,6 +136,12 @@ struct mac80211_hwsim_data {
 	unsigned int rx_filter;
 	int started;
 	struct timer_list beacon_timer;
+	enum ps_mode {
+		PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL
+	} ps;
+	bool ps_poll_pending;
+	struct dentry *debugfs;
+	struct dentry *debugfs_ps;
 };
 
 
@@ -196,6 +206,34 @@ static void mac80211_hwsim_monitor_rx(struct ieee80211_hw *hw,
 }
 
 
+static bool hwsim_ps_rx_ok(struct mac80211_hwsim_data *data,
+			   struct sk_buff *skb)
+{
+	switch (data->ps) {
+	case PS_DISABLED:
+		return true;
+	case PS_ENABLED:
+		return false;
+	case PS_AUTO_POLL:
+		/* TODO: accept (some) Beacons by default and other frames only
+		 * if pending PS-Poll has been sent */
+		return true;
+	case PS_MANUAL_POLL:
+		/* Allow unicast frames to own address if there is a pending
+		 * PS-Poll */
+		if (data->ps_poll_pending &&
+		    memcmp(data->hw->wiphy->perm_addr, skb->data + 4,
+			   ETH_ALEN) == 0) {
+			data->ps_poll_pending = false;
+			return true;
+		}
+		return false;
+	}
+
+	return true;
+}
+
+
 static bool mac80211_hwsim_tx_frame(struct ieee80211_hw *hw,
 				    struct sk_buff *skb)
 {
@@ -212,6 +250,9 @@ static bool mac80211_hwsim_tx_frame(struct ieee80211_hw *hw,
 	rx_status.rate_idx = info->control.rates[0].idx;
 	/* TODO: simulate signal strength (and optional packet drop) */
 
+	if (data->ps != PS_DISABLED)
+		hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM);
+
 	/* Copy skb to all enabled radios that are on the current frequency */
 	spin_lock(&hwsim_radio_lock);
 	list_for_each_entry(data2, &hwsim_radios, list) {
@@ -221,6 +262,7 @@ static bool mac80211_hwsim_tx_frame(struct ieee80211_hw *hw,
 			continue;
 
 		if (!data2->started || !data2->radio_enabled ||
+		    !hwsim_ps_rx_ok(data2, skb) ||
 		    data->channel->center_freq != data2->channel->center_freq)
 			continue;
 
@@ -404,7 +446,16 @@ static int mac80211_hwsim_config_interface(struct ieee80211_hw *hw,
 					   struct ieee80211_vif *vif,
 					   struct ieee80211_if_conf *conf)
 {
+	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
+
 	hwsim_check_magic(vif);
+	if (conf->changed & IEEE80211_IFCC_BSSID) {
+		DECLARE_MAC_BUF(mac);
+		printk(KERN_DEBUG "%s:%s: BSSID changed: %s\n",
+		       wiphy_name(hw->wiphy), __func__,
+		       print_mac(mac, conf->bssid));
+		memcpy(vp->bssid, conf->bssid, ETH_ALEN);
+	}
 	return 0;
 }
 
@@ -413,6 +464,8 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
 					    struct ieee80211_bss_conf *info,
 					    u32 changed)
 {
+	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
+
 	hwsim_check_magic(vif);
 
 	printk(KERN_DEBUG "%s:%s(changed=0x%x)\n",
@@ -421,6 +474,8 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
 	if (changed & BSS_CHANGED_ASSOC) {
 		printk(KERN_DEBUG "  %s: ASSOC: assoc=%d aid=%d\n",
 		       wiphy_name(hw->wiphy), info->assoc, info->aid);
+		vp->assoc = info->assoc;
+		vp->aid = info->aid;
 	}
 
 	if (changed & BSS_CHANGED_ERP_CTS_PROT) {
@@ -518,6 +573,8 @@ static void mac80211_hwsim_free(void)
 	spin_unlock_bh(&hwsim_radio_lock);
 
 	list_for_each_entry(data, &tmplist, list) {
+		debugfs_remove(data->debugfs_ps);
+		debugfs_remove(data->debugfs);
 		ieee80211_unregister_hw(data->hw);
 		device_unregister(data->dev);
 		ieee80211_free_hw(data->hw);
@@ -543,6 +600,127 @@ static void hwsim_mon_setup(struct net_device *dev)
 }
 
 
+static void hwsim_send_ps_poll(void *dat, u8 *mac, struct ieee80211_vif *vif)
+{
+	struct mac80211_hwsim_data *data = dat;
+	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
+	DECLARE_MAC_BUF(buf);
+	struct sk_buff *skb;
+	struct ieee80211_pspoll *pspoll;
+
+	if (!vp->assoc)
+		return;
+
+	printk(KERN_DEBUG "%s:%s: send PS-Poll to %s for aid %d\n",
+	       wiphy_name(data->hw->wiphy), __func__,
+	       print_mac(buf, vp->bssid), vp->aid);
+
+	skb = dev_alloc_skb(sizeof(*pspoll));
+	if (!skb)
+		return;
+	pspoll = (void *) skb_put(skb, sizeof(*pspoll));
+	pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL |
+					    IEEE80211_STYPE_PSPOLL |
+					    IEEE80211_FCTL_PM);
+	pspoll->aid = cpu_to_le16(0xc000 | vp->aid);
+	memcpy(pspoll->bssid, vp->bssid, ETH_ALEN);
+	memcpy(pspoll->ta, mac, ETH_ALEN);
+	if (data->radio_enabled &&
+	    !mac80211_hwsim_tx_frame(data->hw, skb))
+		printk(KERN_DEBUG "%s: PS-Poll frame not ack'ed\n", __func__);
+	dev_kfree_skb(skb);
+}
+
+
+static void hwsim_send_nullfunc(struct mac80211_hwsim_data *data, u8 *mac,
+				struct ieee80211_vif *vif, int ps)
+{
+	struct hwsim_vif_priv *vp = (void *)vif->drv_priv;
+	DECLARE_MAC_BUF(buf);
+	struct sk_buff *skb;
+	struct ieee80211_hdr *hdr;
+
+	if (!vp->assoc)
+		return;
+
+	printk(KERN_DEBUG "%s:%s: send data::nullfunc to %s ps=%d\n",
+	       wiphy_name(data->hw->wiphy), __func__,
+	       print_mac(buf, vp->bssid), ps);
+
+	skb = dev_alloc_skb(sizeof(*hdr));
+	if (!skb)
+		return;
+	hdr = (void *) skb_put(skb, sizeof(*hdr) - ETH_ALEN);
+	hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
+					 IEEE80211_STYPE_NULLFUNC |
+					 (ps ? IEEE80211_FCTL_PM : 0));
+	hdr->duration_id = cpu_to_le16(0);
+	memcpy(hdr->addr1, vp->bssid, ETH_ALEN);
+	memcpy(hdr->addr2, mac, ETH_ALEN);
+	memcpy(hdr->addr3, vp->bssid, ETH_ALEN);
+	if (data->radio_enabled &&
+	    !mac80211_hwsim_tx_frame(data->hw, skb))
+		printk(KERN_DEBUG "%s: nullfunc frame not ack'ed\n", __func__);
+	dev_kfree_skb(skb);
+}
+
+
+static void hwsim_send_nullfunc_ps(void *dat, u8 *mac,
+				   struct ieee80211_vif *vif)
+{
+	struct mac80211_hwsim_data *data = dat;
+	hwsim_send_nullfunc(data, mac, vif, 1);
+}
+
+
+static void hwsim_send_nullfunc_no_ps(void *dat, u8 *mac,
+				      struct ieee80211_vif *vif)
+{
+	struct mac80211_hwsim_data *data = dat;
+	hwsim_send_nullfunc(data, mac, vif, 0);
+}
+
+
+static int hwsim_fops_ps_read(void *dat, u64 *val)
+{
+	struct mac80211_hwsim_data *data = dat;
+	*val = data->ps;
+	return 0;
+}
+
+static int hwsim_fops_ps_write(void *dat, u64 val)
+{
+	struct mac80211_hwsim_data *data = dat;
+	enum ps_mode old_ps;
+
+	if (val != PS_DISABLED && val != PS_ENABLED && val != PS_AUTO_POLL &&
+	    val != PS_MANUAL_POLL)
+		return -EINVAL;
+
+	old_ps = data->ps;
+	data->ps = val;
+
+	if (val == PS_MANUAL_POLL) {
+		ieee80211_iterate_active_interfaces(data->hw,
+						    hwsim_send_ps_poll, data);
+		data->ps_poll_pending = true;
+	} else if (old_ps == PS_DISABLED && val != PS_DISABLED) {
+		ieee80211_iterate_active_interfaces(data->hw,
+						    hwsim_send_nullfunc_ps,
+						    data);
+	} else if (old_ps != PS_DISABLED && val == PS_DISABLED) {
+		ieee80211_iterate_active_interfaces(data->hw,
+						    hwsim_send_nullfunc_no_ps,
+						    data);
+	}
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(hwsim_fops_ps, hwsim_fops_ps_read, hwsim_fops_ps_write,
+			"%llu\n");
+
+
 static int __init init_mac80211_hwsim(void)
 {
 	int i, err = 0;
@@ -634,6 +812,12 @@ static int __init init_mac80211_hwsim(void)
 		       wiphy_name(hw->wiphy),
 		       hw->wiphy->perm_addr);
 
+		data->debugfs = debugfs_create_dir("hwsim",
+						   hw->wiphy->debugfsdir);
+		data->debugfs_ps = debugfs_create_file("ps", 0666,
+						       data->debugfs, data,
+						       &hwsim_fops_ps);
+
 		setup_timer(&data->beacon_timer, mac80211_hwsim_beacon,
 			    (unsigned long) hw);
 
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 9dc288b920c..56b0eb25d92 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -669,6 +669,13 @@ struct ieee80211_cts {
 	u8 ra[6];
 } __attribute__ ((packed));
 
+struct ieee80211_pspoll {
+	__le16 frame_control;
+	__le16 aid;
+	u8 bssid[6];
+	u8 ta[6];
+} __attribute__ ((packed));
+
 /**
  * struct ieee80211_bar - HT Block Ack Request
  *
-- 
cgit v1.2.3-70-g09d2


From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 11 Nov 2008 07:03:45 +0100
Subject: tracing, x86: add low level support for ftrace return tracing

Impact: add infrastructure for function-return tracing

Add low level support for ftrace return tracing.

This plug-in stores return addresses on the thread_info structure of
the current task.

The index of the current return address is initialized when the task
is the first one (init) and when a process forks (the child). It is
not needed when a task does a sys_execve because after this syscall,
it still needs to return on the kernel functions it called.

Note that the code of return_to_handler has been suggested by Steven
Rostedt as almost all of the ideas of improvements in this V3.

For purpose of security, arch/x86/kernel/process_32.c is not traced
because __switch_to() changes the current task during its execution.
That could cause inconsistency in the stored return address of this
function even if I didn't have any crash after testing with tracing on
this function enabled.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                   |   1 +
 arch/x86/include/asm/ftrace.h      |  26 ++++++
 arch/x86/include/asm/thread_info.h |  24 +++++
 arch/x86/kernel/Makefile           |   6 ++
 arch/x86/kernel/entry_32.S         |  33 +++++++
 arch/x86/kernel/ftrace.c           | 181 +++++++++++++++++++++++++++++++++++--
 include/linux/ftrace.h             |  20 ++++
 include/linux/ftrace_irq.h         |   2 +-
 include/linux/sched.h              |  11 +++
 kernel/Makefile                    |   4 +
 10 files changed, 300 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 27b8a3a3991..ca91e50bdb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -11,6 +11,7 @@ config 64BIT
 
 config X86_32
 	def_bool !64BIT
+	select HAVE_FUNCTION_RET_TRACER
 
 config X86_64
 	def_bool 64BIT
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index f8173ed1c97..9b6a1fa19e7 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -20,4 +20,30 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_RET_TRACER
+#define FTRACE_RET_STACK_SIZE 20
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+	unsigned long ret;
+	unsigned long func;
+	unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relays on ftrace_return_to_handler.
+ * Defined in entry32.S
+ */
+extern void return_to_handler(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_RET_TRACER */
+
 #endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad..a71158369fd 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,6 +20,7 @@
 struct task_struct;
 struct exec_domain;
 #include <asm/processor.h>
+#include <asm/ftrace.h>
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
@@ -38,8 +39,30 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	/* Index of current stored adress in ret_stack */
+	int		curr_ret_stack;
+	/* Stack of return addresses for return function tracing */
+	struct ftrace_ret_stack	ret_stack[FTRACE_RET_STACK_SIZE];
+#endif
 };
 
+#ifdef CONFIG_FUNCTION_RET_TRACER
+#define INIT_THREAD_INFO(tsk)			\
+{						\
+	.task		= &tsk,			\
+	.exec_domain	= &default_exec_domain,	\
+	.flags		= 0,			\
+	.cpu		= 0,			\
+	.preempt_count	= 1,			\
+	.addr_limit	= KERNEL_DS,		\
+	.restart_block = {			\
+		.fn = do_no_restart_syscall,	\
+	},					\
+	.curr_ret_stack = -1,\
+}
+#else
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.task		= &tsk,			\
@@ -52,6 +75,7 @@ struct thread_info {
 		.fn = do_no_restart_syscall,	\
 	},					\
 }
+#endif
 
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e489ff9cb3e..1d8ed95da84 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 endif
 
+ifdef CONFIG_FUNCTION_RET_TRACER
+# Don't trace __switch_to() but let it for function tracer
+CFLAGS_REMOVE_process_32.o = -pg
+endif
+
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
@@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_FUNCTION_RET_TRACER)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9134de814c9..9a0ac85946d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1188,6 +1188,10 @@ ENTRY(mcount)
 
 	cmpl $ftrace_stub, ftrace_trace_function
 	jnz trace
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	cmpl $ftrace_stub, ftrace_function_return
+	jnz trace_return
+#endif
 .globl ftrace_stub
 ftrace_stub:
 	ret
@@ -1206,8 +1210,37 @@ trace:
 	popl %edx
 	popl %ecx
 	popl %eax
+	jmp ftrace_stub
 
+#ifdef CONFIG_FUNCTION_RET_TRACER
+trace_return:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	pushl %eax
+	lea 0x4(%ebp), %eax
+	pushl %eax
+	call prepare_ftrace_return
+	addl $8, %esp
+	popl %edx
+	popl %ecx
+	popl %eax
 	jmp ftrace_stub
+
+.globl return_to_handler
+return_to_handler:
+	pushl $0
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	call ftrace_return_to_handler
+	movl %eax, 0xc(%esp)
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+#endif /* CONFIG_FUNCTION_RET_TRACER */
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 69149337f2f..d68033bba22 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -18,10 +18,173 @@
 #include <linux/list.h>
 
 #include <asm/ftrace.h>
+#include <linux/ftrace.h>
 #include <asm/nops.h>
+#include <asm/nmi.h>
 
 
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+	atomic_dec(&in_nmi);
+}
+
+/*
+ * Synchronize accesses to return adresses stack with
+ * interrupts.
+ */
+static raw_spinlock_t ret_stack_lock;
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+				unsigned long func)
+{
+	int index;
+	struct thread_info *ti;
+	unsigned long flags;
+	int err = 0;
+
+	raw_local_irq_save(flags);
+	__raw_spin_lock(&ret_stack_lock);
+
+	ti = current_thread_info();
+	/* The return trace stack is full */
+	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	index = ++ti->curr_ret_stack;
+	ti->ret_stack[index].ret = ret;
+	ti->ret_stack[index].func = func;
+	ti->ret_stack[index].calltime = time;
+
+out:
+	__raw_spin_unlock(&ret_stack_lock);
+	raw_local_irq_restore(flags);
+	return err;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(unsigned long *ret, unsigned long long *time,
+				unsigned long *func)
+{
+	struct thread_info *ti;
+	int index;
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	__raw_spin_lock(&ret_stack_lock);
+
+	ti = current_thread_info();
+	index = ti->curr_ret_stack;
+	*ret = ti->ret_stack[index].ret;
+	*func = ti->ret_stack[index].func;
+	*time = ti->ret_stack[index].calltime;
+	ti->curr_ret_stack--;
+
+	__raw_spin_unlock(&ret_stack_lock);
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+	struct ftrace_retfunc trace;
+	pop_return_trace(&trace.ret, &trace.calltime, &trace.func);
+	trace.rettime = cpu_clock(raw_smp_processor_id());
+	ftrace_function_return(&trace);
+
+	return trace.ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+asmlinkage
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+	unsigned long old;
+	unsigned long long calltime;
+	int faulted;
+	unsigned long return_hooker = (unsigned long)
+				&return_to_handler;
+
+	/* Nmi's are currently unsupported */
+	if (atomic_read(&in_nmi))
+		return;
+
+	/*
+	 * Protect against fault, even if it shouldn't
+	 * happen. This tool is too much intrusive to
+	 * ignore such a protection.
+	 */
+	asm volatile(
+		"1: movl (%[parent_old]), %[old]\n"
+		"2: movl %[return_hooker], (%[parent_replaced])\n"
+		"   movl $0, %[faulted]\n"
+
+		".section .fixup, \"ax\"\n"
+		"3: movl $1, %[faulted]\n"
+		".previous\n"
+
+		".section __ex_table, \"a\"\n"
+		"   .long 1b, 3b\n"
+		"   .long 2b, 3b\n"
+		".previous\n"
+
+		: [parent_replaced] "=rm" (parent), [old] "=r" (old),
+		  [faulted] "=r" (faulted)
+		: [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+		: "memory"
+	);
+
+	if (WARN_ON(faulted)) {
+		unregister_ftrace_return();
+		return;
+	}
+
+	if (WARN_ON(!__kernel_text_address(old))) {
+		unregister_ftrace_return();
+		*parent = old;
+		return;
+	}
+
+	calltime = cpu_clock(raw_smp_processor_id());
+
+	if (push_return_trace(old, calltime, self_addr) == -EBUSY)
+		*parent = old;
+}
+
+static int __init init_ftrace_function_return(void)
+{
+	ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+	return 0;
+}
+device_initcall(init_ftrace_function_return);
+
+
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
 
 union ftrace_code_union {
 	char code[MCOUNT_INSN_SIZE];
@@ -31,17 +194,11 @@ union ftrace_code_union {
 	} __attribute__((packed));
 };
 
-
 static int ftrace_calc_offset(long ip, long addr)
 {
 	return (int)(addr - ip);
 }
 
-unsigned char *ftrace_nop_replace(void)
-{
-	return ftrace_nop;
-}
-
 unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
 	static union ftrace_code_union calc;
@@ -183,6 +340,15 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
 }
 
 
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+unsigned char *ftrace_nop_replace(void)
+{
+	return ftrace_nop;
+}
+
 int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
@@ -292,3 +458,4 @@ int __init ftrace_dyn_arch_init(void *data)
 
 	return 0;
 }
+#endif
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1f5608c1102..dcbbf72a88b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -267,6 +267,26 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 #endif
 
 
+/*
+ * Structure that defines a return function trace.
+ */
+struct ftrace_retfunc {
+	unsigned long ret; /* Return address */
+	unsigned long func; /* Current function */
+	unsigned long long calltime;
+	unsigned long long rettime;
+};
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+/* Type of a callback handler of tracing return function */
+typedef void (*trace_function_return_t)(struct ftrace_retfunc *);
+
+extern void register_ftrace_return(trace_function_return_t func);
+/* The current handler in use */
+extern trace_function_return_t ftrace_function_return;
+extern void unregister_ftrace_return(void);
+#endif
+
 /*
  * Structure which defines the trace of an initcall.
  * You don't have to fill the func field since it is
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index b1299d6729f..0b4df55d7a7 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -2,7 +2,7 @@
 #define _LINUX_FTRACE_IRQ_H
 
 
-#ifdef CONFIG_DYNAMIC_FTRACE
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER)
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 295b7c756ca..df77abe860c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2005,6 +2005,17 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
 {
 	*task_thread_info(p) = *task_thread_info(org);
 	task_thread_info(p)->task = p;
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	/*
+	 * When fork() creates a child process, this function is called.
+	 * But the child task may not inherit the return adresses traced
+	 * by the return function tracer because it will directly execute
+	 * in userspace and will not return to kernel functions its parent
+	 * used.
+	 */
+	task_thread_info(p)->curr_ret_stack = -1;
+#endif
 }
 
 static inline unsigned long *end_of_stack(struct task_struct *p)
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d8..af3be57acbb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,10 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
+ifdef CONFIG_FUNCTION_RET_TRACER
+CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
+CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
+endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-- 
cgit v1.2.3-70-g09d2


From 9d36be76c55ad2c2bb29683b752b0d9ad2e4eeef Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:07 +1100
Subject: Document the order of arguments for cap_issubset.  It's not instantly
 clear which order the argument should be in.  So give an example.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 5bc145bd759..b5750d0b96e 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -457,6 +457,13 @@ static inline int cap_isclear(const kernel_cap_t a)
 	return 1;
 }
 
+/*
+ * Check if "a" is a subset of "set".
+ * return 1 if ALL of the capabilities in "a" are also in "set"
+ *	cap_issubset(0101, 1111) will return 1
+ * return 0 if ANY of the capabilities in "a" are not in "set"
+ *	cap_issubset(1111, 0101) will return 0
+ */
 static inline int cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
 {
 	kernel_cap_t dest;
-- 
cgit v1.2.3-70-g09d2


From c0b004413a46a0a5744e6d2b85220fe9d2c33d48 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:10 +1100
Subject: This patch add a generic cpu endian caps structure and externally
 available functions which retrieve fcaps information from disk.  This
 information is necessary so fcaps information can be collected and recorded
 by the audit system.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h |   7 +++
 security/commoncap.c       | 129 +++++++++++++++++++++++++--------------------
 2 files changed, 78 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index b5750d0b96e..d567af247ed 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -99,6 +99,13 @@ typedef struct kernel_cap_struct {
 	__u32 cap[_KERNEL_CAPABILITY_U32S];
 } kernel_cap_t;
 
+/* exact same as vfs_cap_data but in cpu endian and always filled completely */
+struct cpu_vfs_cap_data {
+	__u32 magic_etc;
+	kernel_cap_t permitted;
+	kernel_cap_t inheritable;
+};
+
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
diff --git a/security/commoncap.c b/security/commoncap.c
index f88119cb2bc..d7eff5797b9 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -202,17 +202,70 @@ int cap_inode_killpriv(struct dentry *dentry)
 	return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
 }
 
-static inline int cap_from_disk(struct vfs_cap_data *caps,
-				struct linux_binprm *bprm, unsigned size)
+static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
+					  struct linux_binprm *bprm)
 {
+	unsigned i;
+	int ret = 0;
+
+	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
+		bprm->cap_effective = true;
+	else
+		bprm->cap_effective = false;
+
+	CAP_FOR_EACH_U32(i) {
+		__u32 permitted = caps->permitted.cap[i];
+		__u32 inheritable = caps->inheritable.cap[i];
+
+		/*
+		 * pP' = (X & fP) | (pI & fI)
+		 */
+		bprm->cap_post_exec_permitted.cap[i] =
+			(current->cap_bset.cap[i] & permitted) |
+			(current->cap_inheritable.cap[i] & inheritable);
+
+		if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) {
+			/*
+			 * insufficient to execute correctly
+			 */
+			ret = -EPERM;
+		}
+	}
+
+	/*
+	 * For legacy apps, with no internal support for recognizing they
+	 * do not have enough capabilities, we return an error if they are
+	 * missing some "forced" (aka file-permitted) capabilities.
+	 */
+	return bprm->cap_effective ? ret : 0;
+}
+
+int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+{
+	struct inode *inode = dentry->d_inode;
 	__u32 magic_etc;
 	unsigned tocopy, i;
-	int ret;
+	int size;
+	struct vfs_cap_data caps;
+
+	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
+
+	if (!inode || !inode->i_op || !inode->i_op->getxattr)
+		return -ENODATA;
+
+	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps,
+				   XATTR_CAPS_SZ);
+	if (size == -ENODATA || size == -EOPNOTSUPP) {
+		/* no data, that's ok */
+		return -ENODATA;
+	}
+	if (size < 0)
+		return size;
 
 	if (size < sizeof(magic_etc))
 		return -EINVAL;
 
-	magic_etc = le32_to_cpu(caps->magic_etc);
+	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
 
 	switch ((magic_etc & VFS_CAP_REVISION_MASK)) {
 	case VFS_CAP_REVISION_1:
@@ -229,46 +282,13 @@ static inline int cap_from_disk(struct vfs_cap_data *caps,
 		return -EINVAL;
 	}
 
-	if (magic_etc & VFS_CAP_FLAGS_EFFECTIVE) {
-		bprm->cap_effective = true;
-	} else {
-		bprm->cap_effective = false;
-	}
-
-	ret = 0;
-
 	CAP_FOR_EACH_U32(i) {
-		__u32 value_cpu;
-
-		if (i >= tocopy) {
-			/*
-			 * Legacy capability sets have no upper bits
-			 */
-			bprm->cap_post_exec_permitted.cap[i] = 0;
-			continue;
-		}
-		/*
-		 * pP' = (X & fP) | (pI & fI)
-		 */
-		value_cpu = le32_to_cpu(caps->data[i].permitted);
-		bprm->cap_post_exec_permitted.cap[i] =
-			(current->cap_bset.cap[i] & value_cpu) |
-			(current->cap_inheritable.cap[i] &
-				le32_to_cpu(caps->data[i].inheritable));
-		if (value_cpu & ~bprm->cap_post_exec_permitted.cap[i]) {
-			/*
-			 * insufficient to execute correctly
-			 */
-			ret = -EPERM;
-		}
+		if (i >= tocopy)
+			break;
+		cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
 	}
-
-	/*
-	 * For legacy apps, with no internal support for recognizing they
-	 * do not have enough capabilities, we return an error if they are
-	 * missing some "forced" (aka file-permitted) capabilities.
-	 */
-	return bprm->cap_effective ? ret : 0;
+	return 0;
 }
 
 /* Locate any VFS capabilities: */
@@ -276,8 +296,7 @@ static int get_file_caps(struct linux_binprm *bprm)
 {
 	struct dentry *dentry;
 	int rc = 0;
-	struct vfs_cap_data vcaps;
-	struct inode *inode;
+	struct cpu_vfs_cap_data vcaps;
 
 	bprm_clear_caps(bprm);
 
@@ -288,24 +307,18 @@ static int get_file_caps(struct linux_binprm *bprm)
 		return 0;
 
 	dentry = dget(bprm->file->f_dentry);
-	inode = dentry->d_inode;
-	if (!inode->i_op || !inode->i_op->getxattr)
-		goto out;
 
-	rc = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, &vcaps,
-				   XATTR_CAPS_SZ);
-	if (rc == -ENODATA || rc == -EOPNOTSUPP) {
-		/* no data, that's ok */
-		rc = 0;
+	rc = get_vfs_caps_from_disk(dentry, &vcaps);
+	if (rc < 0) {
+		if (rc == -EINVAL)
+			printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
+				__func__, rc, bprm->filename);
+		else if (rc == -ENODATA)
+			rc = 0;
 		goto out;
 	}
-	if (rc < 0)
-		goto out;
 
-	rc = cap_from_disk(&vcaps, bprm, rc);
-	if (rc == -EINVAL)
-		printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n",
-		       __func__, rc, bprm->filename);
+	rc = bprm_caps_from_vfs_caps(&vcaps, bprm);
 
 out:
 	dput(dentry);
-- 
cgit v1.2.3-70-g09d2


From 851f7ff56d9c21272f289dd85fb3f1b6cf7a6e10 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:14 +1100
Subject: This patch will print cap_permitted and cap_inheritable data in the
 PATH records of any file that has file capabilities set.  Files which do not
 have fcaps set will not have different PATH records.

An example audit record if you run:
setcap "cap_net_admin+pie" /bin/bash
/bin/bash

type=SYSCALL msg=audit(1225741937.363:230): arch=c000003e syscall=59 success=yes exit=0 a0=2119230 a1=210da30 a2=20ee290 a3=8 items=2 ppid=2149 pid=2923 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=EXECVE msg=audit(1225741937.363:230): argc=2 a0="ping" a1="www.google.com"
type=CWD msg=audit(1225741937.363:230):  cwd="/root"
type=PATH msg=audit(1225741937.363:230): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0104755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fi=0000000000002000 cap_fe=1 cap_fver=2
type=PATH msg=audit(1225741937.363:230): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h |  5 +++
 kernel/auditsc.c           | 82 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 82 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index d567af247ed..0f195018110 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -53,6 +53,7 @@ typedef struct __user_cap_data_struct {
 #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
 
 #define VFS_CAP_REVISION_MASK	0xFF000000
+#define VFS_CAP_REVISION_SHIFT	24
 #define VFS_CAP_FLAGS_MASK	~VFS_CAP_REVISION_MASK
 #define VFS_CAP_FLAGS_EFFECTIVE	0x000001
 
@@ -534,6 +535,10 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
 
 extern int capable(int cap);
 
+/* audit system wants to get cap info from files as well */
+struct dentry;
+extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+
 #endif /* __KERNEL__ */
 
 #endif /* !_LINUX_CAPABILITY_H */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c..de7e9bcba9a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,6 +65,7 @@
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
 #include <linux/inotify.h>
+#include <linux/capability.h>
 
 #include "audit.h"
 
@@ -84,6 +85,15 @@ int audit_n_rules;
 /* determines whether we collect data for signals sent */
 int audit_signals;
 
+struct audit_cap_data {
+	kernel_cap_t		permitted;
+	kernel_cap_t		inheritable;
+	union {
+		unsigned int	fE;		/* effective bit of a file capability */
+		kernel_cap_t	effective;	/* effective set of a process */
+	};
+};
+
 /* When fs/namei.c:getname() is called, we store the pointer in name and
  * we don't let putname() free it (instead we free all of the saved
  * pointers at syscall exit time).
@@ -100,6 +110,8 @@ struct audit_names {
 	gid_t		gid;
 	dev_t		rdev;
 	u32		osid;
+	struct audit_cap_data fcap;
+	unsigned int	fcap_ver;
 };
 
 struct audit_aux_data {
@@ -1171,6 +1183,35 @@ static void audit_log_execve_info(struct audit_context *context,
 	kfree(buf);
 }
 
+static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+	int i;
+
+	audit_log_format(ab, " %s=", prefix);
+	CAP_FOR_EACH_U32(i) {
+		audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+	}
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+	kernel_cap_t *perm = &name->fcap.permitted;
+	kernel_cap_t *inh = &name->fcap.inheritable;
+	int log = 0;
+
+	if (!cap_isclear(*perm)) {
+		audit_log_cap(ab, "cap_fp", perm);
+		log = 1;
+	}
+	if (!cap_isclear(*inh)) {
+		audit_log_cap(ab, "cap_fi", inh);
+		log = 1;
+	}
+
+	if (log)
+		audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	int i, call_panic = 0;
@@ -1421,6 +1462,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			}
 		}
 
+		audit_log_fcaps(ab, n);
+
 		audit_log_end(ab);
 	}
 
@@ -1787,8 +1830,36 @@ static int audit_inc_name_count(struct audit_context *context,
 	return 0;
 }
 
+
+static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
+{
+	struct cpu_vfs_cap_data caps;
+	int rc;
+
+	memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
+	memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
+	name->fcap.fE = 0;
+	name->fcap_ver = 0;
+
+	if (!dentry)
+		return 0;
+
+	rc = get_vfs_caps_from_disk(dentry, &caps);
+	if (rc)
+		return rc;
+
+	name->fcap.permitted = caps.permitted;
+	name->fcap.inheritable = caps.inheritable;
+	name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+	name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
+
+	return 0;
+}
+
+
 /* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
+static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+			     const struct inode *inode)
 {
 	name->ino   = inode->i_ino;
 	name->dev   = inode->i_sb->s_dev;
@@ -1797,6 +1868,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
 	name->gid   = inode->i_gid;
 	name->rdev  = inode->i_rdev;
 	security_inode_getsecid(inode, &name->osid);
+	audit_copy_fcaps(name, dentry);
 }
 
 /**
@@ -1831,7 +1903,7 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 		context->names[idx].name = NULL;
 	}
 	handle_path(dentry);
-	audit_copy_inode(&context->names[idx], inode);
+	audit_copy_inode(&context->names[idx], dentry, inode);
 }
 
 /**
@@ -1892,7 +1964,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
 		if (!strcmp(dname, n->name) ||
 		     !audit_compare_dname_path(dname, n->name, &dirlen)) {
 			if (inode)
-				audit_copy_inode(n, inode);
+				audit_copy_inode(n, NULL, inode);
 			else
 				n->ino = (unsigned long)-1;
 			found_child = n->name;
@@ -1906,7 +1978,7 @@ add_names:
 			return;
 		idx = context->name_count - 1;
 		context->names[idx].name = NULL;
-		audit_copy_inode(&context->names[idx], parent);
+		audit_copy_inode(&context->names[idx], NULL, parent);
 	}
 
 	if (!found_child) {
@@ -1927,7 +1999,7 @@ add_names:
 		}
 
 		if (inode)
-			audit_copy_inode(&context->names[idx], inode);
+			audit_copy_inode(&context->names[idx], NULL, inode);
 		else
 			context->names[idx].ino = (unsigned long)-1;
 	}
-- 
cgit v1.2.3-70-g09d2


From 3fc689e96c0c90b6fede5946d6c31075e9464f69 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:18 +1100
Subject: Any time fcaps or a setuid app under SECURE_NOROOT is used to result
 in a non-zero pE we will crate a new audit record which contains the entire
 set of known information about the executable in question, fP, fI, fE,
 fversion and includes the process's pE, pI, pP.  Before and after the bprm
 capability are applied.  This record type will only be emitted from execve
 syscalls.

an example of making ping use fcaps instead of setuid:

setcap "cat_net_raw+pe" /bin/ping

type=SYSCALL msg=audit(1225742021.015:236): arch=c000003e syscall=59 success=yes exit=0 a0=1457f30 a1=14606b0 a2=1463940 a3=321b770a70 items=2 ppid=2929 pid=2963 auid=0 uid=500 gid=500 euid=500 suid=500 fsuid=500 egid=500 sgid=500 fsgid=500 tty=pts0 ses=3 comm="ping" exe="/bin/ping" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=UNKNOWN[1321] msg=audit(1225742021.015:236): fver=2 fp=0000000000002000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 new_pp=0000000000002000 new_pi=0000000000000000 new_pe=0000000000002000
type=EXECVE msg=audit(1225742021.015:236): argc=2 a0="ping" a1="127.0.0.1"
type=CWD msg=audit(1225742021.015:236):  cwd="/home/test"
type=PATH msg=audit(1225742021.015:236): item=0 name="/bin/ping" inode=49256 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ping_exec_t:s0 cap_fp=0000000000002000 cap_fe=1 cap_fver=2
type=PATH msg=audit(1225742021.015:236): item=1 name=(null) inode=507915 dev=fd:00 mode=0100755 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:ld_so_t:s0

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/audit.h | 26 ++++++++++++++++++++
 kernel/auditsc.c      | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++
 security/commoncap.c  | 23 ++++++++++++++++-
 3 files changed, 116 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6272a395d43..8cfb9feb2a0 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -99,6 +99,7 @@
 #define AUDIT_OBJ_PID		1318	/* ptrace target */
 #define AUDIT_TTY		1319	/* Input on an administrative TTY */
 #define AUDIT_EOE		1320	/* End of multi-record event */
+#define AUDIT_BPRM_FCAPS	1321	/* Information about fcaps increasing perms */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
@@ -452,6 +453,7 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr
 extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
 extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
 extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
+extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE);
 
 static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -501,6 +503,29 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 		return __audit_mq_getsetattr(mqdes, mqstat);
 	return 0;
 }
+
+/*
+ * ieieeeeee, an audit function without a return code!
+ *
+ * This function might fail!  I decided that it didn't matter.  We are too late
+ * to fail the syscall and the information isn't REQUIRED for any purpose.  It's
+ * just nice to have.  We should be able to look at past audit logs to figure
+ * out this process's current cap set along with the fcaps from the PATH record
+ * and use that to come up with the final set.  Yeah, its ugly, but all the info
+ * is still in the audit log.  So I'm not going to bother mentioning we failed
+ * if we couldn't allocate memory.
+ *
+ * If someone changes their mind they could create the aux record earlier and
+ * then search here and use that earlier allocation.  But I don't wanna.
+ *
+ * -Eric
+ */
+static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+{
+	if (unlikely(!audit_dummy_context()))
+		__audit_log_bprm_fcaps(bprm, pP, pE);
+}
+
 extern int audit_n_rules;
 extern int audit_signals;
 #else
@@ -532,6 +557,7 @@ extern int audit_signals;
 #define audit_mq_timedreceive(d,l,p,t) ({ 0; })
 #define audit_mq_notify(d,n) ({ 0; })
 #define audit_mq_getsetattr(d,s) ({ 0; })
+#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0)
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index de7e9bcba9a..3229cd4206f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -196,6 +196,14 @@ struct audit_aux_data_pids {
 	int			pid_count;
 };
 
+struct audit_aux_data_bprm_fcaps {
+	struct audit_aux_data	d;
+	struct audit_cap_data	fcap;
+	unsigned int		fcap_ver;
+	struct audit_cap_data	old_pcap;
+	struct audit_cap_data	new_pcap;
+};
+
 struct audit_tree_refs {
 	struct audit_tree_refs *next;
 	struct audit_chunk *c[31];
@@ -1375,6 +1383,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
 			break; }
 
+		case AUDIT_BPRM_FCAPS: {
+			struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+			audit_log_format(ab, "fver=%x", axs->fcap_ver);
+			audit_log_cap(ab, "fp", &axs->fcap.permitted);
+			audit_log_cap(ab, "fi", &axs->fcap.inheritable);
+			audit_log_format(ab, " fe=%d", axs->fcap.fE);
+			audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
+			audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
+			audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
+			audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
+			audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
+			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+			break; }
+
 		}
 		audit_log_end(ab);
 	}
@@ -2501,6 +2523,52 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	return 0;
 }
 
+/**
+ * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
+ * @bprm pointer to the bprm being processed
+ * @caps the caps read from the disk
+ *
+ * Simply check if the proc already has the caps given by the file and if not
+ * store the priv escalation info for later auditing at the end of the syscall
+ *
+ * this can fail and we don't care.  See the note in audit.h for
+ * audit_log_bprm_fcaps() for my explaination....
+ *
+ * -Eric
+ */
+void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+{
+	struct audit_aux_data_bprm_fcaps *ax;
+	struct audit_context *context = current->audit_context;
+	struct cpu_vfs_cap_data vcaps;
+	struct dentry *dentry;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	if (!ax)
+		return;
+
+	ax->d.type = AUDIT_BPRM_FCAPS;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+
+	dentry = dget(bprm->file->f_dentry);
+	get_vfs_caps_from_disk(dentry, &vcaps);
+	dput(dentry);
+
+	ax->fcap.permitted = vcaps.permitted;
+	ax->fcap.inheritable = vcaps.inheritable;
+	ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
+
+	ax->old_pcap.permitted = *pP;
+	ax->old_pcap.inheritable = current->cap_inheritable;
+	ax->old_pcap.effective = *pE;
+
+	ax->new_pcap.permitted = current->cap_permitted;
+	ax->new_pcap.inheritable = current->cap_inheritable;
+	ax->new_pcap.effective = current->cap_effective;
+}
+
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
diff --git a/security/commoncap.c b/security/commoncap.c
index d7eff5797b9..d4539338099 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/capability.h>
+#include <linux/audit.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -376,6 +377,9 @@ int cap_bprm_set_security (struct linux_binprm *bprm)
 
 void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
+	kernel_cap_t pP = current->cap_permitted;
+	kernel_cap_t pE = current->cap_effective;
+
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
 	    !cap_issubset(bprm->cap_post_exec_permitted,
 			  current->cap_permitted)) {
@@ -409,7 +413,24 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 			cap_clear(current->cap_effective);
 	}
 
-	/* AUD: Audit candidate if current->cap_effective is set */
+	/*
+	 * Audit candidate if current->cap_effective is set
+	 *
+	 * We do not bother to audit if 3 things are true:
+	 *   1) cap_effective has all caps
+	 *   2) we are root
+	 *   3) root is supposed to have all caps (SECURE_NOROOT)
+	 * Since this is just a normal root execing a process.
+	 *
+	 * Number 1 above might fail if you don't have a full bset, but I think
+	 * that is interesting information to audit.
+	 */
+	if (!cap_isclear(current->cap_effective)) {
+		if (!cap_issubset(CAP_FULL_SET, current->cap_effective) ||
+		    (bprm->e_uid != 0) || (current->uid != 0) ||
+		    issecure(SECURE_NOROOT))
+			audit_log_bprm_fcaps(bprm, &pP, &pE);
+	}
 
 	current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
 }
-- 
cgit v1.2.3-70-g09d2


From e68b75a027bb94066576139ee33676264f867b87 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 21:48:22 +1100
Subject: When the capset syscall is used it is not possible for audit to
 record the actual capbilities being added/removed.  This patch adds a new
 record type which emits the target pid and the eff, inh, and perm cap sets.

example output if you audit capset syscalls would be:

type=SYSCALL msg=audit(1225743140.465:76): arch=c000003e syscall=126 success=yes exit=0 a0=17f2014 a1=17f201c a2=80000000 a3=7fff2ab7f060 items=0 ppid=2160 pid=2223 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=1 comm="setcap" exe="/usr/sbin/setcap" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
type=UNKNOWN[1322] msg=audit(1225743140.465:76): pid=0 cap_pi=ffffffffffffffff cap_pp=ffffffffffffffff cap_pe=ffffffffffffffff

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/audit.h | 10 ++++++++++
 kernel/auditsc.c      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/capability.c   |  5 +++++
 3 files changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8cfb9feb2a0..6fbebac7b1b 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -100,6 +100,7 @@
 #define AUDIT_TTY		1319	/* Input on an administrative TTY */
 #define AUDIT_EOE		1320	/* End of multi-record event */
 #define AUDIT_BPRM_FCAPS	1321	/* Information about fcaps increasing perms */
+#define AUDIT_CAPSET		1322	/* Record showing argument to sys_capset */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
@@ -454,6 +455,7 @@ extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __u
 extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
 extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
 extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE);
+extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm);
 
 static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -526,6 +528,13 @@ static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t
 		__audit_log_bprm_fcaps(bprm, pP, pE);
 }
 
+static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+{
+	if (unlikely(!audit_dummy_context()))
+		return __audit_log_capset(pid, eff, inh, perm);
+	return 0;
+}
+
 extern int audit_n_rules;
 extern int audit_signals;
 #else
@@ -558,6 +567,7 @@ extern int audit_signals;
 #define audit_mq_notify(d,n) ({ 0; })
 #define audit_mq_getsetattr(d,s) ({ 0; })
 #define audit_log_bprm_fcaps(b, p, e) do { ; } while (0)
+#define audit_log_capset(pid, e, i, p) ({ 0; })
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3229cd4206f..cef34235b36 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -204,6 +204,12 @@ struct audit_aux_data_bprm_fcaps {
 	struct audit_cap_data	new_pcap;
 };
 
+struct audit_aux_data_capset {
+	struct audit_aux_data	d;
+	pid_t			pid;
+	struct audit_cap_data	cap;
+};
+
 struct audit_tree_refs {
 	struct audit_tree_refs *next;
 	struct audit_chunk *c[31];
@@ -1397,6 +1403,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
 			break; }
 
+		case AUDIT_CAPSET: {
+			struct audit_aux_data_capset *axs = (void *)aux;
+			audit_log_format(ab, "pid=%d", axs->pid);
+			audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
+			audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
+			audit_log_cap(ab, "cap_pe", &axs->cap.effective);
+			break; }
+
 		}
 		audit_log_end(ab);
 	}
@@ -2569,6 +2583,40 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->new_pcap.effective = current->cap_effective;
 }
 
+/**
+ * __audit_log_capset - store information about the arguments to the capset syscall
+ * @pid target pid of the capset call
+ * @eff effective cap set
+ * @inh inheritible cap set
+ * @perm permited cap set
+ *
+ * Record the aguments userspace sent to sys_capset for later printing by the
+ * audit system if applicable
+ */
+int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+{
+	struct audit_aux_data_capset *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!audit_enabled || !context || context->dummy))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->d.type = AUDIT_CAPSET;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+
+	ax->pid = pid;
+	ax->cap.effective = *eff;
+	ax->cap.inheritable = *eff;
+	ax->cap.permitted = *perm;
+
+	return 0;
+}
+
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
diff --git a/kernel/capability.c b/kernel/capability.c
index e13a68535ad..19f9eda8997 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
  * 30 May 2002:	Cleanup, Robert M. Love <rml@tech9.net>
  */
 
+#include <linux/audit.h>
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -468,6 +469,10 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
+	ret = audit_log_capset(pid, &effective, &inheritable, &permitted);
+	if (ret)
+		return ret;
+
 	if (pid && (pid != task_pid_vnr(current)))
 		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
 						&permitted);
-- 
cgit v1.2.3-70-g09d2


From 06112163f5fd9e491a7f810443d81efa9d88e247 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 Nov 2008 22:02:50 +1100
Subject: Add a new capable interface that will be used by systems that use
 audit to make an A or B type decision instead of a security decision. 
 Currently this is the case at least for filesystems when deciding if a
 process can use the reserved 'root' blocks and for the case of things like
 the oom algorithm determining if processes are root processes and should be
 less likely to be killed.  These types of security system requests should not
 be audited or logged since they are not really security decisions.  It would
 be possible to solve this problem like the vm_enough_memory security check
 did by creating a new LSM interface and moving all of the policy into that
 interface but proves the needlessly bloat the LSM and provide complex
 indirection.

This merely allows those decisions to be made where they belong and to not
flood logs or printk with denials for thing that are not security decisions.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h |  3 +++
 include/linux/security.h   | 16 +++++++++++++---
 security/commoncap.c       |  8 ++++----
 security/security.c        |  7 ++++++-
 security/selinux/hooks.c   | 20 +++++++++++++-------
 5 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 0f195018110..b313ba1dd5d 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -521,6 +521,8 @@ extern const kernel_cap_t __cap_init_eff_set;
 
 kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
 
+extern int security_capable(struct task_struct *t, int cap);
+extern int security_capable_noaudit(struct task_struct *t, int cap);
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
@@ -532,6 +534,7 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
  * Note that this does not set PF_SUPERPRIV on the task.
  */
 #define has_capability(t, cap) (security_capable((t), (cap)) == 0)
+#define has_capability_noaudit(t, cap) (security_capable_noaudit((t), (cap)) == 0)
 
 extern int capable(int cap);
 
diff --git a/include/linux/security.h b/include/linux/security.h
index c13f1cec9ab..5fe28a671cd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -37,6 +37,10 @@
 /* Maximum number of letters for an LSM name string */
 #define SECURITY_NAME_MAX	10
 
+/* If capable should audit the security request */
+#define SECURITY_CAP_NOAUDIT 0
+#define SECURITY_CAP_AUDIT 1
+
 struct ctl_table;
 struct audit_krule;
 
@@ -44,7 +48,7 @@ struct audit_krule;
  * These functions are in security/capability.c and are used
  * as the default capabilities functions
  */
-extern int cap_capable(struct task_struct *tsk, int cap);
+extern int cap_capable(struct task_struct *tsk, int cap, int audit);
 extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
@@ -1307,7 +1311,7 @@ struct security_operations {
 			    kernel_cap_t *effective,
 			    kernel_cap_t *inheritable,
 			    kernel_cap_t *permitted);
-	int (*capable) (struct task_struct *tsk, int cap);
+	int (*capable) (struct task_struct *tsk, int cap, int audit);
 	int (*acct) (struct file *file);
 	int (*sysctl) (struct ctl_table *table, int op);
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
@@ -1577,6 +1581,7 @@ void security_capset_set(struct task_struct *target,
 			 kernel_cap_t *inheritable,
 			 kernel_cap_t *permitted);
 int security_capable(struct task_struct *tsk, int cap);
+int security_capable_noaudit(struct task_struct *tsk, int cap);
 int security_acct(struct file *file);
 int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
@@ -1782,7 +1787,12 @@ static inline void security_capset_set(struct task_struct *target,
 
 static inline int security_capable(struct task_struct *tsk, int cap)
 {
-	return cap_capable(tsk, cap);
+	return cap_capable(tsk, cap, SECURITY_CAP_AUDIT);
+}
+
+static inline int security_capable_noaudit(struct task_struct *tsk, int cap)
+{
+	return cap_capable(tsk, cap, SECURITY_CAP_NOAUDIT);
 }
 
 static inline int security_acct(struct file *file)
diff --git a/security/commoncap.c b/security/commoncap.c
index d4539338099..dc06c0086b5 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -49,7 +49,7 @@ EXPORT_SYMBOL(cap_netlink_recv);
  * returns 0 when a task has a capability, but the kernel's capable()
  * returns 1 for this case.
  */
-int cap_capable (struct task_struct *tsk, int cap)
+int cap_capable(struct task_struct *tsk, int cap, int audit)
 {
 	/* Derived from include/linux/sched.h:capable. */
 	if (cap_raised(tsk->cap_effective, cap))
@@ -112,7 +112,7 @@ static inline int cap_inh_is_capped(void)
 	 * to the old permitted set. That is, if the current task
 	 * does *not* possess the CAP_SETPCAP capability.
 	 */
-	return (cap_capable(current, CAP_SETPCAP) != 0);
+	return (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0);
 }
 
 static inline int cap_limit_ptraced_target(void) { return 1; }
@@ -677,7 +677,7 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		    || ((current->securebits & SECURE_ALL_LOCKS
 			 & ~arg2))                                    /*[2]*/
 		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/
-		    || (cap_capable(current, CAP_SETPCAP) != 0)) {    /*[4]*/
+		    || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
 			 * [2] no unlocking of locks
@@ -742,7 +742,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int cap_sys_admin = 0;
 
-	if (cap_capable(current, CAP_SYS_ADMIN) == 0)
+	if (cap_capable(current, CAP_SYS_ADMIN, SECURITY_CAP_NOAUDIT) == 0)
 		cap_sys_admin = 1;
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
 }
diff --git a/security/security.c b/security/security.c
index c0acfa7177e..346f21e0ec2 100644
--- a/security/security.c
+++ b/security/security.c
@@ -163,7 +163,12 @@ void security_capset_set(struct task_struct *target,
 
 int security_capable(struct task_struct *tsk, int cap)
 {
-	return security_ops->capable(tsk, cap);
+	return security_ops->capable(tsk, cap, SECURITY_CAP_AUDIT);
+}
+
+int security_capable_noaudit(struct task_struct *tsk, int cap)
+{
+	return security_ops->capable(tsk, cap, SECURITY_CAP_NOAUDIT);
 }
 
 int security_acct(struct file *file)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7fd4de46b2a..88a3ee33068 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1365,12 +1365,14 @@ static int task_has_perm(struct task_struct *tsk1,
 
 /* Check whether a task is allowed to use a capability. */
 static int task_has_capability(struct task_struct *tsk,
-			       int cap)
+			       int cap, int audit)
 {
 	struct task_security_struct *tsec;
 	struct avc_audit_data ad;
+	struct av_decision avd;
 	u16 sclass;
 	u32 av = CAP_TO_MASK(cap);
+	int rc;
 
 	tsec = tsk->security;
 
@@ -1390,7 +1392,11 @@ static int task_has_capability(struct task_struct *tsk,
 		       "SELinux:  out of range capability %d\n", cap);
 		BUG();
 	}
-	return avc_has_perm(tsec->sid, tsec->sid, sclass, av, &ad);
+
+	rc = avc_has_perm_noaudit(tsec->sid, tsec->sid, sclass, av, 0, &avd);
+	if (audit == SECURITY_CAP_AUDIT)
+		avc_audit(tsec->sid, tsec->sid, sclass, av, &avd, rc, &ad);
+	return rc;
 }
 
 /* Check whether a task is allowed to use a system operation. */
@@ -1802,15 +1808,15 @@ static void selinux_capset_set(struct task_struct *target, kernel_cap_t *effecti
 	secondary_ops->capset_set(target, effective, inheritable, permitted);
 }
 
-static int selinux_capable(struct task_struct *tsk, int cap)
+static int selinux_capable(struct task_struct *tsk, int cap, int audit)
 {
 	int rc;
 
-	rc = secondary_ops->capable(tsk, cap);
+	rc = secondary_ops->capable(tsk, cap, audit);
 	if (rc)
 		return rc;
 
-	return task_has_capability(tsk, cap);
+	return task_has_capability(tsk, cap, audit);
 }
 
 static int selinux_sysctl_get_sid(ctl_table *table, u16 tclass, u32 *sid)
@@ -1975,7 +1981,7 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 	int rc, cap_sys_admin = 0;
 	struct task_security_struct *tsec = current->security;
 
-	rc = secondary_ops->capable(current, CAP_SYS_ADMIN);
+	rc = secondary_ops->capable(current, CAP_SYS_ADMIN, SECURITY_CAP_NOAUDIT);
 	if (rc == 0)
 		rc = avc_has_perm_noaudit(tsec->sid, tsec->sid,
 					  SECCLASS_CAPABILITY,
@@ -2829,7 +2835,7 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name
 	 * and lack of permission just means that we fall back to the
 	 * in-core context value, not a denial.
 	 */
-	error = secondary_ops->capable(current, CAP_MAC_ADMIN);
+	error = secondary_ops->capable(current, CAP_MAC_ADMIN, SECURITY_CAP_NOAUDIT);
 	if (!error)
 		error = avc_has_perm_noaudit(tsec->sid, tsec->sid,
 					     SECCLASS_CAPABILITY2,
-- 
cgit v1.2.3-70-g09d2


From 50ee91765e25e7967a7b69cd5cc2bcab85e2eeb8 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Tue, 11 Nov 2008 18:13:23 +0530
Subject: sched/rt: removed unneeded defintion

Impact: cleanup

This function no longer exists, so remove the defintion.

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b483f39a711..c6bfb34d978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -258,8 +258,6 @@ static inline int select_nohz_load_balancer(int cpu)
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
-- 
cgit v1.2.3-70-g09d2


From d90ebcbfa7f5a8b4e20518c9f94c5c4e4cd3c2e5 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 12 Nov 2008 00:47:26 -0800
Subject: dccp: Query supported CCIDs

This provides a data structure to record which CCIDs are locally supported
and three accessor functions:
 - a test function for internal use which is used to validate CCID requests
   made by the user;
 - a copy function so that the list can be used for feature-negotiation;
 - documented getsockopt() support so that the user can query capabilities.

The data structure is a table which is filled in at compile-time with the
list of available CCIDs (which in turn depends on the Kconfig choices).

Using the copy function for cloning the list of supported CCIDs is useful for
feature negotiation, since the negotiation is now with the full list of available
CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation
will not fail if the peer requests to use CCID3 instead of CCID2.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dccp.txt |  4 ++++
 include/linux/dccp.h              |  1 +
 net/dccp/ccid.c                   | 48 +++++++++++++++++++++++++++++++++++++++
 net/dccp/ccid.h                   |  5 ++++
 net/dccp/feat.c                   |  4 ++++
 net/dccp/proto.c                  |  2 ++
 6 files changed, 64 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 39131a3c78f..f0aeb20fa63 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -57,6 +57,10 @@ can be set before calling bind().
 DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet
 size (application payload size) in bytes, see RFC 4340, section 14.
 
+DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs
+supported by the endpoint (see include/linux/dccp.h for symbolic constants).
+The caller needs to provide a sufficiently large (> 2) array of type uint8_t.
+
 DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold
 timewait state when closing the connection (RFC 4340, 8.3). The usual case is
 that the closing server sends a CloseReq, whereupon the client holds timewait
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 484b8a1fb02..d3ac1bde60b 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -209,6 +209,7 @@ struct dccp_so_feat {
 #define DCCP_SOCKOPT_SERVER_TIMEWAIT	6
 #define DCCP_SOCKOPT_SEND_CSCOV		10
 #define DCCP_SOCKOPT_RECV_CSCOV		11
+#define DCCP_SOCKOPT_AVAILABLE_CCIDS	12
 #define DCCP_SOCKOPT_CCID_RX_INFO	128
 #define DCCP_SOCKOPT_CCID_TX_INFO	192
 
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 8fe931a3d7a..647cb0614f8 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -13,6 +13,13 @@
 
 #include "ccid.h"
 
+static u8 builtin_ccids[] = {
+	DCCPC_CCID2,		/* CCID2 is supported by default */
+#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE)
+	DCCPC_CCID3,
+#endif
+};
+
 static struct ccid_operations *ccids[CCID_MAX];
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 static atomic_t ccids_lockct = ATOMIC_INIT(0);
@@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
 	}
 }
 
+/* check that up to @array_len members in @ccid_array are supported */
+bool ccid_support_check(u8 const *ccid_array, u8 array_len)
+{
+	u8 i, j, found;
+
+	for (i = 0, found = 0; i < array_len; i++, found = 0) {
+		for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++)
+			found = (ccid_array[i] == builtin_ccids[j]);
+		if (!found)
+			return false;
+	}
+	return true;
+}
+
+/**
+ * ccid_get_builtin_ccids  -  Provide copy of `builtin' CCID array
+ * @ccid_array: pointer to copy into
+ * @array_len: value to return length into
+ * This function allocates memory - caller must see that it is freed after use.
+ */
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
+{
+	*ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any());
+	if (*ccid_array == NULL)
+		return -ENOBUFS;
+	*array_len = ARRAY_SIZE(builtin_ccids);
+	return 0;
+}
+
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+				    char __user *optval, int __user *optlen)
+{
+	if (len < sizeof(builtin_ccids))
+		return -EINVAL;
+
+	if (put_user(sizeof(builtin_ccids), optlen) ||
+	    copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids)))
+		return -EFAULT;
+	return 0;
+}
+
 int ccid_register(struct ccid_operations *ccid_ops)
 {
 	int err = -ENOBUFS;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index fdeae7b5731..259f5469d7d 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -103,6 +103,11 @@ static inline void *ccid_priv(const struct ccid *ccid)
 	return (void *)ccid->ccid_priv;
 }
 
+extern bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+extern int  ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+					  char __user *, int __user *);
+
 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
 			     gfp_t gfp);
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 192d494a381..f79fb5e33f5 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -342,6 +342,10 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
 	    !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
 		return -EINVAL;
 
+	/* Avoid negotiating alien CCIDs by only advertising supported ones */
+	if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
+		return -EOPNOTSUPP;
+
 	if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
 		return -ENOMEM;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 01332fe7a99..b4b10cbd888 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -649,6 +649,8 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_GET_CUR_MPS:
 		val = dp->dccps_mss_cache;
 		break;
+	case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+		return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		val = dp->dccps_server_timewait;
 		break;
-- 
cgit v1.2.3-70-g09d2


From 3f5ec13696fd4a33bde42f385406cbb1d3cc96fd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 11 Nov 2008 23:21:31 +0100
Subject: tracing/fastboot: move boot tracer structs and funcs into their own
 header.

Impact: Cleanups on the boot tracer and ftrace

This patch bring some cleanups about the boot tracer headers. The
functions and structures of this tracer have nothing related to ftrace
and should have so their own header file.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 41 -----------------------------------------
 include/trace/boot.h   | 43 +++++++++++++++++++++++++++++++++++++++++++
 init/main.c            |  1 +
 kernel/trace/trace.h   |  1 +
 4 files changed, 45 insertions(+), 41 deletions(-)
 create mode 100644 include/trace/boot.h

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index dcbbf72a88b..4fbc4a8b86a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -287,45 +287,4 @@ extern trace_function_return_t ftrace_function_return;
 extern void unregister_ftrace_return(void);
 #endif
 
-/*
- * Structure which defines the trace of an initcall.
- * You don't have to fill the func field since it is
- * only used internally by the tracer.
- */
-struct boot_trace {
-	pid_t			caller;
-	char			func[KSYM_NAME_LEN];
-	int			result;
-	unsigned long long	duration;		/* usecs */
-	ktime_t			calltime;
-	ktime_t			rettime;
-};
-
-#ifdef CONFIG_BOOT_TRACER
-/* Append the trace on the ring-buffer */
-extern void trace_boot(struct boot_trace *it, initcall_t fn);
-
-/* Tells the tracer that smp_pre_initcall is finished.
- * So we can start the tracing
- */
-extern void start_boot_trace(void);
-
-/* Resume the tracing of other necessary events
- * such as sched switches
- */
-extern void enable_boot_trace(void);
-
-/* Suspend this tracing. Actually, only sched_switches tracing have
- * to be suspended. Initcalls doesn't need it.)
- */
-extern void disable_boot_trace(void);
-#else
-static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
-static inline void start_boot_trace(void) { }
-static inline void enable_boot_trace(void) { }
-static inline void disable_boot_trace(void) { }
-#endif
-
-
-
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/trace/boot.h b/include/trace/boot.h
new file mode 100644
index 00000000000..4cbe64e46cd
--- /dev/null
+++ b/include/trace/boot.h
@@ -0,0 +1,43 @@
+#ifndef _LINUX_TRACE_BOOT_H
+#define _LINUX_TRACE_BOOT_H
+
+/*
+ * Structure which defines the trace of an initcall.
+ * You don't have to fill the func field since it is
+ * only used internally by the tracer.
+ */
+struct boot_trace {
+	pid_t			caller;
+	char			func[KSYM_NAME_LEN];
+	int			result;
+	unsigned long long	duration;		/* usecs */
+	ktime_t			calltime;
+	ktime_t			rettime;
+};
+
+#ifdef CONFIG_BOOT_TRACER
+/* Append the trace on the ring-buffer */
+extern void trace_boot(struct boot_trace *it, initcall_t fn);
+
+/* Tells the tracer that smp_pre_initcall is finished.
+ * So we can start the tracing
+ */
+extern void start_boot_trace(void);
+
+/* Resume the tracing of other necessary events
+ * such as sched switches
+ */
+extern void enable_boot_trace(void);
+
+/* Suspend this tracing. Actually, only sched_switches tracing have
+ * to be suspended. Initcalls doesn't need it.)
+ */
+extern void disable_boot_trace(void);
+#else
+static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
+static inline void start_boot_trace(void) { }
+static inline void enable_boot_trace(void) { }
+static inline void disable_boot_trace(void) { }
+#endif /* CONFIG_BOOT_TRACER */
+
+#endif /* __LINUX_TRACE_BOOT_H */
diff --git a/init/main.c b/init/main.c
index 4b03cd5656c..16ca1ee071c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e40ce0c1469..f69a5199596 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
-- 
cgit v1.2.3-70-g09d2


From 92a77aac9812d5397abbe6f1920e085e50838635 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Wed, 12 Nov 2008 21:20:00 +1100
Subject: security: remove broken and useless declarations

Remove broken declarations for security_capable* functions,
which were not needed anyway.

Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/capability.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index b313ba1dd5d..7f26580a5a4 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -521,8 +521,6 @@ extern const kernel_cap_t __cap_init_eff_set;
 
 kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
 
-extern int security_capable(struct task_struct *t, int cap);
-extern int security_capable_noaudit(struct task_struct *t, int cap);
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
-- 
cgit v1.2.3-70-g09d2


From 1f0d69a9fc815db82f15722bf05227190b1d714d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 12 Nov 2008 00:14:39 -0500
Subject: tracing: profile likely and unlikely annotations

Impact: new unlikely/likely profiler

Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. When the condition is hit, the hit and misses of that condition
are recorded. These numbers can later be retrieved by:

  /debugfs/tracing/profile_likely    - All likely markers
  /debugfs/tracing/profile_unlikely  - All unlikely markers.

# cat /debug/tracing/profile_unlikely | head
 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
    2167        0   0 do_arch_prctl                  process_64.c         832
       0        0   0 do_arch_prctl                  process_64.c         804
    2670        0   0 IS_ERR                         err.h                34
   71230     5693   7 __switch_to                    process_64.c         673
   76919        0   0 __switch_to                    process_64.c         639
   43184    33743  43 __switch_to                    process_64.c         624
   12740    64181  83 __switch_to                    process_64.c         594
   12740    64174  83 __switch_to                    process_64.c         590

# cat /debug/tracing/profile_unlikely | \
  awk '{ if ($3 > 25) print $0; }' |head -20
   44963    35259  43 __switch_to                    process_64.c         624
   12762    67454  84 __switch_to                    process_64.c         594
   12762    67447  84 __switch_to                    process_64.c         590
    1478      595  28 syscall_get_error              syscall.h            51
       0     2821 100 syscall_trace_leave            ptrace.c             1567
       0        1 100 native_smp_prepare_cpus        smpboot.c            1237
   86338   265881  75 calc_delta_fair                sched_fair.c         408
  210410   108540  34 calc_delta_mine                sched.c              1267
       0    54550 100 sched_info_queued              sched_stats.h        222
   51899    66435  56 pick_next_task_fair            sched_fair.c         1422
       6       10  62 yield_task_fair                sched_fair.c         982
    7325     2692  26 rt_policy                      sched.c              144
       0     1270 100 pre_schedule_rt                sched_rt.c           1261
    1268    48073  97 pick_next_task_rt              sched_rt.c           884
       0    45181 100 sched_info_dequeued            sched_stats.h        177
       0       15 100 sched_move_task                sched.c              8700
       0       15 100 sched_move_task                sched.c              8690
   53167    33217  38 schedule                       sched.c              4457
       0    80208 100 sched_info_switch              sched_stats.h        270
   30585    49631  61 context_switch                 sched.c              2619

# cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }'
   39900    36577  47 pick_next_task                 sched.c              4397
   20824    15233  42 switch_mm                      mmu_context_64.h     18
       0        7 100 __cancel_work_timer            workqueue.c          560
     617    66484  99 clocksource_adjust             timekeeping.c        456
       0   346340 100 audit_syscall_exit             auditsc.c            1570
      38   347350  99 audit_get_context              auditsc.c            732
       0   345244 100 audit_syscall_entry            auditsc.c            1541
      38     1017  96 audit_free                     auditsc.c            1446
       0     1090 100 audit_alloc                    auditsc.c            862
    2618     1090  29 audit_alloc                    auditsc.c            858
       0        6 100 move_masked_irq                migration.c          9
       1      198  99 probe_sched_wakeup             trace_sched_switch.c 58
       2        2  50 probe_wakeup                   trace_sched_wakeup.c 227
       0        2 100 probe_wakeup_sched_switch      trace_sched_wakeup.c 144
    4514     2090  31 __grab_cache_page              filemap.c            2149
   12882   228786  94 mapping_unevictable            pagemap.h            50
       4       11  73 __flush_cpu_slab               slub.c               1466
  627757   330451  34 slab_free                      slub.c               1731
    2959    61245  95 dentry_lru_del_init            dcache.c             153
     946     1217  56 load_elf_binary                binfmt_elf.c         904
     102       82  44 disk_put_part                  genhd.h              206
       1        1  50 dst_gc_task                    dst.c                82
       0       19 100 tcp_mss_split_point            tcp_output.c         1126

As you can see by the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely case had 71 hits
that were more than 25%.

Note:  After submitting my first version of this patch, Andrew Morton
  showed me a version written by Daniel Walker, where I picked up
  the following ideas from:

  1)  Using __builtin_constant_p to avoid profiling fixed values.
  2)  Using __FILE__ instead of instruction pointers.
  3)  Using the preprocessor to stop all profiling of likely
       annotations from vsyscall_64.c.

Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
for their feed back on this patch.

(*) Not ever unlikely is recorded, those that are used by vsyscalls
 (a few of them) had to have profiling disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c     |   8 ++
 include/asm-generic/vmlinux.lds.h |  14 +++-
 include/linux/compiler.h          |  61 +++++++++++++-
 kernel/trace/Kconfig              |  16 ++++
 kernel/trace/Makefile             |   1 +
 kernel/trace/trace_unlikely.c     | 164 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 261 insertions(+), 3 deletions(-)
 create mode 100644 kernel/trace/trace_unlikely.c

(limited to 'include/linux')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86..2f90202e59b 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,14 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Protect userspace from profiling */
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+# undef likely
+# undef unlikely
+# define likely(x)		likely_notrace(x)
+# define unlikely(x)		unlikely_notrace(x)
+#endif
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 80744606bad..e10beb5335c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -45,6 +45,17 @@
 #define MCOUNT_REC()
 #endif
 
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_likely_profile) = .;   \
+				*(_ftrace_likely)			      \
+				VMLINUX_SYMBOL(__stop_likely_profile) = .;    \
+				VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+				*(_ftrace_unlikely)			      \
+				VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -62,7 +73,8 @@
 	VMLINUX_SYMBOL(__stop___markers) = .;				\
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
-	VMLINUX_SYMBOL(__stop___tracepoints) = .;
+	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
+	LIKELY_PROFILE()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 98115d9d04d..935e30cfaf3 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * specific implementations come from the above header files
  */
 
-#define likely(x)	__builtin_expect(!!(x), 1)
-#define unlikely(x)	__builtin_expect(!!(x), 0)
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+	const char *func;
+	const char *file;
+	unsigned line;
+	unsigned long correct;
+	unsigned long incorrect;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
+
+#define likely_notrace(x)	__builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
+
+#define likely_check(x) ({						\
+			int ______r;					\
+			static struct ftrace_likely_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_likely"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______f.line = __LINE__;			\
+			______r = likely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, 1);	\
+			______r;					\
+		})
+#define unlikely_check(x) ({						\
+			int ______r;					\
+			static struct ftrace_likely_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_unlikely"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______f.line = __LINE__;			\
+			______r = unlikely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, 0);	\
+			______r;					\
+		})
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same.  This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : likely_check(x))
+# endif
+# ifndef unlikely
+#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
+# endif
+#else
+# define likely(x)	__builtin_expect(!!(x), 1)
+# define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif
 
 /* Optimization barrier */
 #ifndef barrier
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d986216c832..a604f24c755 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -159,6 +159,22 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
+config TRACE_UNLIKELY_PROFILE
+	bool "Trace likely/unlikely profiler"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer profiles all the the likely and unlikely macros
+	  in the kernel. It will display the results in:
+
+	  /debugfs/tracing/profile_likely
+	  /debugfs/tracing/profile_unlikely
+
+	  Note: this will add a significant overhead, only turn this
+	  on if you need to profile the system's use of these macros.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3e1f361bbc1..98e70ee2798 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -25,5 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
new file mode 100644
index 00000000000..94932696069
--- /dev/null
+++ b/kernel/trace/trace_unlikely.c
@@ -0,0 +1,164 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+{
+	/* FIXME: Make this atomic! */
+	if (val == expect)
+		f->correct++;
+	else
+		f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+	void		*start;
+	void		*stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_pointer *f = m->private;
+	struct ftrace_likely_data *p = v;
+
+	(*pos)++;
+
+	if (v == (void *)1)
+		return f->start;
+
+	++p;
+
+	if ((void *)p >= (void *)f->stop)
+		return NULL;
+
+	return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = (void *)1;
+	loff_t l = 0;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+	const char *f;
+	unsigned long percent;
+
+	if (v == (void *)1) {
+		seq_printf(m, " correct incorrect  %% "
+			      "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+		return 0;
+	}
+
+	/* Only print the file, not the path */
+	f = p->file + strlen(p->file);
+	while (f >= p->file && *f != '/')
+		f--;
+	f++;
+
+	if (p->correct) {
+		percent = p->incorrect * 100;
+		percent /= p->correct + p->incorrect;
+	} else
+		percent = p->incorrect ? 100 : 0;
+
+	seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
+	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+	return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &tracing_likely_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = (void *)inode->i_private;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+	.open		= tracing_likely_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+	.start			= __start_likely_profile,
+	.stop			= __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+	.start			= __start_unlikely_profile,
+	.stop			= __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+				    &ftrace_likely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+				    &ftrace_unlikely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_unlikely' entry\n");
+
+	return 0;
+}
+
+device_initcall(ftrace_unlikely_init);
-- 
cgit v1.2.3-70-g09d2


From e25cf3db560e803292946ef23a30c69e341ce56f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Oct 2008 15:55:07 +0200
Subject: lockdep: include/linux/lockdep.h - fix warning in
 net/bluetooth/af_bluetooth.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix this warning:

  net/bluetooth/af_bluetooth.c:60: warning: ‘bt_key_strings’ defined but not used
  net/bluetooth/af_bluetooth.c:71: warning: ‘bt_slock_key_strings’ defined but not used

this is a lockdep macro problem in the !LOCKDEP case.

We cannot convert it to an inline because the macro works on multiple types,
but we can mark the parameter used.

[ also clean up a misaligned tab in sock_lock_init_class_and_name() ]

[ also remove #ifdefs from around af_family_clock_key strings - which
  were certainly added to get rid of the ugly build warnings. ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h | 5 +++--
 include/net/sock.h      | 2 +-
 net/core/sock.c         | 2 --
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index fc9f8e88123..8956daf64ab 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -336,10 +336,11 @@ static inline void lockdep_on(void)
 # define lock_set_subclass(l, s, i)		do { } while (0)
 # define lockdep_init()				do { } while (0)
 # define lockdep_info()				do { } while (0)
-# define lockdep_init_map(lock, name, key, sub)	do { (void)(key); } while (0)
+# define lockdep_init_map(lock, name, key, sub) \
+		do { (void)(name); (void)(key); } while (0)
 # define lockdep_set_class(lock, key)		do { (void)(key); } while (0)
 # define lockdep_set_class_and_name(lock, key, name) \
-		do { (void)(key); } while (0)
+		do { (void)(key); (void)(name); } while (0)
 #define lockdep_set_class_and_subclass(lock, key, sub) \
 		do { (void)(key); } while (0)
 #define lockdep_set_subclass(lock, sub)		do { } while (0)
diff --git a/include/net/sock.h b/include/net/sock.h
index c04f9e18ea2..2f47107f6d0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -815,7 +815,7 @@ static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
  */
 #define sock_lock_init_class_and_name(sk, sname, skey, name, key) 	\
 do {									\
-	sk->sk_lock.owned = 0;					\
+	sk->sk_lock.owned = 0;						\
 	init_waitqueue_head(&sk->sk_lock.wq);				\
 	spin_lock_init(&(sk)->sk_lock.slock);				\
 	debug_check_no_locks_freed((void *)&(sk)->sk_lock,		\
diff --git a/net/core/sock.c b/net/core/sock.c
index 5e2a3132a8c..341e3945695 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -136,7 +136,6 @@
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
@@ -187,7 +186,6 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
   "clock-AF_MAX"
 };
-#endif
 
 /*
  * sk_callback_lock locking rules are per-address-family,
-- 
cgit v1.2.3-70-g09d2


From 2b7d0390a6d6d595f43ea3806639664afe5b9ebe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 12 Nov 2008 13:17:38 +0100
Subject: tracing: branch tracer, fix vdso crash

Impact: fix bootup crash

the branch tracer missed arch/x86/vdso/vclock_gettime.c from
disabling tracing, which caused such bootup crashes:

  [  201.840097] init[1]: segfault at 7fffed3fe7c0 ip 00007fffed3fea2e sp 000077

also clean up the ugly ifdefs in arch/x86/kernel/vsyscall_64.c by
creating DISABLE_UNLIKELY_PROFILE facility for code to turn off
instrumentation on a per file basis.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c  | 9 ++-------
 arch/x86/vdso/vclock_gettime.c | 3 +++
 include/linux/compiler.h       | 6 +++++-
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 2f90202e59b..ece02932ea5 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,13 +17,8 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
-/* Protect userspace from profiling */
-#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
-# undef likely
-# undef unlikely
-# define likely(x)		likely_notrace(x)
-# define unlikely(x)		unlikely_notrace(x)
-#endif
+/* Disable profiling for userspace code: */
+#define DISABLE_UNLIKELY_PROFILE
 
 #include <linux/time.h>
 #include <linux/init.h>
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d..6e667631e7d 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
  * Also alternative() doesn't work.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_UNLIKELY_PROFILE
+
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
 #include <linux/time.h>
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 935e30cfaf3..63b7d9089d6 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -59,7 +59,11 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * specific implementations come from the above header files
  */
 
-#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+/*
+ * Note: DISABLE_UNLIKELY_PROFILE can be used by special lowlevel code
+ * to disable branch tracing on a per file basis.
+ */
+#if defined(CONFIG_TRACE_UNLIKELY_PROFILE) && !defined(DISABLE_UNLIKELY_PROFILE)
 struct ftrace_likely_data {
 	const char *func;
 	const char *file;
-- 
cgit v1.2.3-70-g09d2


From 2ed84eeb8808cf3c9f039213ca137ffd7d753f0e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 12 Nov 2008 15:24:24 -0500
Subject: trace: rename unlikely profiler to branch profiler

Impact: name change of unlikely tracer and profiler

Ingo Molnar suggested changing the config from UNLIKELY_PROFILE
to BRANCH_PROFILING. I never did like the "unlikely" name so I
went one step farther, and renamed all the unlikely configurations
to a "BRANCH" variant.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vsyscall_64.c     |  2 +-
 arch/x86/vdso/vclock_gettime.c    |  2 +-
 include/asm-generic/vmlinux.lds.h |  2 +-
 include/linux/compiler.h          | 19 ++++++++++---------
 kernel/trace/Kconfig              | 10 +++++-----
 kernel/trace/Makefile             |  7 +++----
 kernel/trace/trace.c              |  2 +-
 kernel/trace/trace.h              |  6 +++---
 kernel/trace/trace_unlikely.c     |  4 ++--
 9 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ece02932ea5..6f3d3d4cd97 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,7 +18,7 @@
  */
 
 /* Disable profiling for userspace code: */
-#define DISABLE_UNLIKELY_PROFILE
+#define DISABLE_BRANCH_PROFILING
 
 #include <linux/time.h>
 #include <linux/init.h>
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 6e667631e7d..d9d35824c56 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -10,7 +10,7 @@
  */
 
 /* Disable profiling for userspace code: */
-#define DISABLE_UNLIKELY_PROFILE
+#define DISABLE_BRANCH_PROFILING
 
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e10beb5335c..a5e4ed9baec 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -45,7 +45,7 @@
 #define MCOUNT_REC()
 #endif
 
-#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#ifdef CONFIG_TRACE_BRANCH_PROFILING
 #define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_likely_profile) = .;   \
 				*(_ftrace_likely)			      \
 				VMLINUX_SYMBOL(__stop_likely_profile) = .;    \
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 63b7d9089d6..c7d804a7a4d 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -59,26 +59,27 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * specific implementations come from the above header files
  */
 
-/*
- * Note: DISABLE_UNLIKELY_PROFILE can be used by special lowlevel code
- * to disable branch tracing on a per file basis.
- */
-#if defined(CONFIG_TRACE_UNLIKELY_PROFILE) && !defined(DISABLE_UNLIKELY_PROFILE)
-struct ftrace_likely_data {
+struct ftrace_branch_data {
 	const char *func;
 	const char *file;
 	unsigned line;
 	unsigned long correct;
 	unsigned long incorrect;
 };
-void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
+
+/*
+ * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
+ * to disable branch tracing on a per file basis.
+ */
+#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 
 #define likely_notrace(x)	__builtin_expect(!!(x), 1)
 #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
 
 #define likely_check(x) ({						\
 			int ______r;					\
-			static struct ftrace_likely_data		\
+			static struct ftrace_branch_data		\
 				__attribute__((__aligned__(4)))		\
 				__attribute__((section("_ftrace_likely"))) \
 				______f = {				\
@@ -93,7 +94,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
 		})
 #define unlikely_check(x) ({						\
 			int ______r;					\
-			static struct ftrace_likely_data		\
+			static struct ftrace_branch_data		\
 				__attribute__((__aligned__(4)))		\
 				__attribute__((section("_ftrace_unlikely"))) \
 				______f = {				\
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8abcaf821be..9c89526b6b7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -159,7 +159,7 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
-config TRACE_UNLIKELY_PROFILE
+config TRACE_BRANCH_PROFILING
 	bool "Trace likely/unlikely profiler"
 	depends on DEBUG_KERNEL
 	select TRACING
@@ -175,7 +175,7 @@ config TRACE_UNLIKELY_PROFILE
 
 	  Say N if unsure.
 
-config TRACING_UNLIKELY
+config TRACING_BRANCHES
 	bool
 	help
 	  Selected by tracers that will trace the likely and unlikely
@@ -183,10 +183,10 @@ config TRACING_UNLIKELY
 	  profiled. Profiling the tracing infrastructure can only happen
 	  when the likelys and unlikelys are not being traced.
 
-config UNLIKELY_TRACER
+config BRANCH_TRACER
 	bool "Trace likely/unlikely instances"
-	depends on TRACE_UNLIKELY_PROFILE
-	select TRACING_UNLIKELY
+	depends on TRACE_BRANCH_PROFILING
+	select TRACING_BRANCHES
 	help
 	  This traces the events of likely and unlikely condition
 	  calls in the kernel.  The difference between this and the
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c938d03516c..0087df7ba44 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -11,9 +11,8 @@ obj-y += trace_selftest_dynamic.o
 endif
 
 # If unlikely tracing is enabled, do not trace these files
-ifdef CONFIG_TRACING_UNLIKELY
-KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
-KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+ifdef CONFIG_TRACING_BRANCHES
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
@@ -31,6 +30,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
-obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_unlikely.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d842db14a59..bad59d32a4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -258,7 +258,7 @@ static const char *trace_options[] = {
 	"sched-tree",
 	"ftrace_printk",
 	"ftrace_preempt",
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 	"unlikely",
 #endif
 	NULL
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9635aa2c4fc..dccae631294 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -468,7 +468,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
 	TRACE_ITER_PREEMPTONLY		= 0x800,
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 	TRACE_ITER_UNLIKELY		= 0x1000,
 #endif
 };
@@ -530,7 +530,7 @@ static inline void ftrace_preempt_enable(int resched)
 		preempt_enable_notrace();
 }
 
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 extern int enable_unlikely_tracing(struct trace_array *tr);
 extern void disable_unlikely_tracing(void);
 static inline int trace_unlikely_enable(struct trace_array *tr)
@@ -552,6 +552,6 @@ static inline int trace_unlikely_enable(struct trace_array *tr)
 static inline void trace_unlikely_disable(void)
 {
 }
-#endif /* CONFIG_UNLIKELY_TRACER */
+#endif /* CONFIG_BRANCH_TRACER */
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_unlikely.c b/kernel/trace/trace_unlikely.c
index 7290e0e7b4e..856eb3b7f69 100644
--- a/kernel/trace/trace_unlikely.c
+++ b/kernel/trace/trace_unlikely.c
@@ -15,7 +15,7 @@
 #include <asm/local.h>
 #include "trace.h"
 
-#ifdef CONFIG_UNLIKELY_TRACER
+#ifdef CONFIG_BRANCH_TRACER
 
 static int unlikely_tracing_enabled __read_mostly;
 static DEFINE_MUTEX(unlikely_tracing_mutex);
@@ -119,7 +119,7 @@ static inline
 void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
 {
 }
-#endif /* CONFIG_UNLIKELY_TRACER */
+#endif /* CONFIG_BRANCH_TRACER */
 
 void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
 {
-- 
cgit v1.2.3-70-g09d2


From da9592edebceeba1b9301beafe80ec8b9c2db0ce Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:05 +1100
Subject: CRED: Wrap task credential accesses in the filesystem subsystem

Wrap access to task credentials so that they can be separated more easily from
the task_struct during the introduction of COW creds.

Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id().

Change some task->e?[ug]id to task_e?[ug]id().  In some places it makes more
sense to use RCU directly rather than a convenient wrapper; these will be
addressed by later patches.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/anon_inodes.c      |  4 ++--
 fs/attr.c             |  4 ++--
 fs/binfmt_elf_fdpic.c |  8 ++++----
 fs/dquot.c            |  4 ++--
 fs/exec.c             | 18 +++++++++---------
 fs/fcntl.c            |  2 +-
 fs/inotify_user.c     |  2 +-
 fs/ioprio.c           |  4 ++--
 fs/locks.c            |  2 +-
 fs/namei.c            | 10 ++++++----
 fs/namespace.c        |  2 +-
 fs/pipe.c             |  4 ++--
 fs/posix_acl.c        |  4 ++--
 fs/quota.c            |  4 ++--
 include/linux/fs.h    |  2 +-
 15 files changed, 38 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3662dd44896..c16d9be1b01 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -154,8 +154,8 @@ static struct inode *anon_inode_mkinode(void)
 	 */
 	inode->i_state = I_DIRTY;
 	inode->i_mode = S_IRUSR | S_IWUSR;
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	return inode;
 }
diff --git a/fs/attr.c b/fs/attr.c
index 7a83819f6ba..f4360192a93 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -29,13 +29,13 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
 
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
-	    (current->fsuid != inode->i_uid ||
+	    (current_fsuid() != inode->i_uid ||
 	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
 		goto error;
 
 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
-	    (current->fsuid != inode->i_uid ||
+	    (current_fsuid() != inode->i_uid ||
 	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
 	    !capable(CAP_CHOWN))
 		goto error;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 5b5424cb339..488584c8751 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -623,10 +623,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	NEW_AUX_ENT(AT_BASE,	interp_params->elfhdr_addr);
 	NEW_AUX_ENT(AT_FLAGS,	0);
 	NEW_AUX_ENT(AT_ENTRY,	exec_params->entry_addr);
-	NEW_AUX_ENT(AT_UID,	(elf_addr_t) current->uid);
-	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) current->euid);
-	NEW_AUX_ENT(AT_GID,	(elf_addr_t) current->gid);
-	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) current->egid);
+	NEW_AUX_ENT(AT_UID,	(elf_addr_t) current_uid());
+	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) current_euid());
+	NEW_AUX_ENT(AT_GID,	(elf_addr_t) current_gid());
+	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) current_egid());
 	NEW_AUX_ENT(AT_SECURE,	security_bprm_secureexec(bprm));
 	NEW_AUX_ENT(AT_EXECFN,	bprm->exec);
 
diff --git a/fs/dquot.c b/fs/dquot.c
index 5e95261005b..c237ccc8581 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -874,7 +874,7 @@ static inline int need_print_warning(struct dquot *dquot)
 
 	switch (dquot->dq_type) {
 		case USRQUOTA:
-			return current->fsuid == dquot->dq_id;
+			return current_fsuid() == dquot->dq_id;
 		case GRPQUOTA:
 			return in_group_p(dquot->dq_id);
 	}
@@ -981,7 +981,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
 		MINOR(dquot->dq_sb->s_dev));
 	if (ret)
 		goto attr_err_out;
-	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current->user->uid);
+	ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
 	if (ret)
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
diff --git a/fs/exec.c b/fs/exec.c
index 4e834f16d9d..604834f3b20 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -980,7 +980,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	/* This is the point of no return */
 	current->sas_ss_sp = current->sas_ss_size = 0;
 
-	if (current->euid == current->uid && current->egid == current->gid)
+	if (current_euid() == current_uid() && current_egid() == current_gid())
 		set_dumpable(current->mm, 1);
 	else
 		set_dumpable(current->mm, suid_dumpable);
@@ -1007,7 +1007,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 */
 	current->mm->task_size = TASK_SIZE;
 
-	if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) {
+	if (bprm->e_uid != current_euid() || bprm->e_gid != current_egid()) {
 		suid_keys(current);
 		set_dumpable(current->mm, suid_dumpable);
 		current->pdeath_signal = 0;
@@ -1047,8 +1047,8 @@ int prepare_binprm(struct linux_binprm *bprm)
 	if (bprm->file->f_op == NULL)
 		return -EACCES;
 
-	bprm->e_uid = current->euid;
-	bprm->e_gid = current->egid;
+	bprm->e_uid = current_euid();
+	bprm->e_gid = current_egid();
 
 	if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
 		/* Set-uid? */
@@ -1096,7 +1096,7 @@ void compute_creds(struct linux_binprm *bprm)
 {
 	int unsafe;
 
-	if (bprm->e_uid != current->uid) {
+	if (bprm->e_uid != current_uid()) {
 		suid_keys(current);
 		current->pdeath_signal = 0;
 	}
@@ -1424,7 +1424,7 @@ static int format_corename(char *corename, long signr)
 			/* uid */
 			case 'u':
 				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", current->uid);
+					      "%d", current_uid());
 				if (rc > out_end - out_ptr)
 					goto out;
 				out_ptr += rc;
@@ -1432,7 +1432,7 @@ static int format_corename(char *corename, long signr)
 			/* gid */
 			case 'g':
 				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", current->gid);
+					      "%d", current_gid());
 				if (rc > out_end - out_ptr)
 					goto out;
 				out_ptr += rc;
@@ -1709,7 +1709,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	struct inode * inode;
 	struct file * file;
 	int retval = 0;
-	int fsuid = current->fsuid;
+	int fsuid = current_fsuid();
 	int flag = 0;
 	int ispipe = 0;
 	unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
@@ -1815,7 +1815,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 * Dont allow local users get cute and trick others to coredump
 	 * into their pre-created files:
 	 */
-	if (inode->i_uid != current->fsuid)
+	if (inode->i_uid != current_fsuid())
 		goto close_fail;
 	if (!file->f_op)
 		goto close_fail;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ac4f7db9f13..bf049a805e5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -211,7 +211,7 @@ int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
 	if (err)
 		return err;
 
-	f_modown(filp, pid, type, current->uid, current->euid, force);
+	f_modown(filp, pid, type, current_uid(), current_euid(), force);
 	return 0;
 }
 EXPORT_SYMBOL(__f_setown);
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index d367e9b9286..e2425bbd871 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -601,7 +601,7 @@ asmlinkage long sys_inotify_init1(int flags)
 		goto out_put_fd;
 	}
 
-	user = get_uid(current->user);
+	user = get_current_user();
 	if (unlikely(atomic_read(&user->inotify_devs) >=
 			inotify_max_user_instances)) {
 		ret = -EMFILE;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index da3cc460d4d..68d2cd80711 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -32,8 +32,8 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 	int err;
 	struct io_context *ioc;
 
-	if (task->uid != current->euid &&
-	    task->uid != current->uid && !capable(CAP_SYS_NICE))
+	if (task->uid != current_euid() &&
+	    task->uid != current_uid() && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	err = security_task_setioprio(task, ioprio);
diff --git a/fs/locks.c b/fs/locks.c
index 09062e3ff10..46a2e12f7d4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1349,7 +1349,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
 
-	if ((current->fsuid != inode->i_uid) && !capable(CAP_LEASE))
+	if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
 		return -EACCES;
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
diff --git a/fs/namei.c b/fs/namei.c
index 09ce58e49e7..42d7b760693 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -186,7 +186,7 @@ int generic_permission(struct inode *inode, int mask,
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
-	if (current->fsuid == inode->i_uid)
+	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
 	else {
 		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
@@ -441,7 +441,7 @@ static int exec_permission_lite(struct inode *inode)
 	if (inode->i_op && inode->i_op->permission)
 		return -EAGAIN;
 
-	if (current->fsuid == inode->i_uid)
+	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
 	else if (in_group_p(inode->i_gid))
 		mode >>= 3;
@@ -1334,11 +1334,13 @@ static int user_path_parent(int dfd, const char __user *path,
  */
 static inline int check_sticky(struct inode *dir, struct inode *inode)
 {
+	uid_t fsuid = current_fsuid();
+
 	if (!(dir->i_mode & S_ISVTX))
 		return 0;
-	if (inode->i_uid == current->fsuid)
+	if (inode->i_uid == fsuid)
 		return 0;
-	if (dir->i_uid == current->fsuid)
+	if (dir->i_uid == fsuid)
 		return 0;
 	return !capable(CAP_FOWNER);
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index cce46702d33..d8bc2c4704a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1176,7 +1176,7 @@ static int mount_is_safe(struct path *path)
 	if (S_ISLNK(path->dentry->d_inode->i_mode))
 		return -EPERM;
 	if (path->dentry->d_inode->i_mode & S_ISVTX) {
-		if (current->uid != path->dentry->d_inode->i_uid)
+		if (current_uid() != path->dentry->d_inode->i_uid)
 			return -EPERM;
 	}
 	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
diff --git a/fs/pipe.c b/fs/pipe.c
index 7aea8b89baa..aaf797bd57b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -899,8 +899,8 @@ static struct inode * get_pipe_inode(void)
 	 */
 	inode->i_state = I_DIRTY;
 	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	return inode;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index aec931e0997..39df95a0ec2 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -217,11 +217,11 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
                 switch(pa->e_tag) {
                         case ACL_USER_OBJ:
 				/* (May have been checked already) */
-                                if (inode->i_uid == current->fsuid)
+				if (inode->i_uid == current_fsuid())
                                         goto check_perm;
                                 break;
                         case ACL_USER:
-                                if (pa->e_id == current->fsuid)
+				if (pa->e_id == current_fsuid())
                                         goto mask;
 				break;
                         case ACL_GROUP_OBJ:
diff --git a/fs/quota.c b/fs/quota.c
index 7f4386ebc23..b7fe44e0161 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -79,7 +79,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 
 	/* Check privileges */
 	if (cmd == Q_GETQUOTA) {
-		if (((type == USRQUOTA && current->euid != id) ||
+		if (((type == USRQUOTA && current_euid() != id) ||
 		     (type == GRPQUOTA && !in_egroup_p(id))) &&
 		    !capable(CAP_SYS_ADMIN))
 			return -EPERM;
@@ -130,7 +130,7 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i
 
 	/* Check privileges */
 	if (cmd == Q_XGETQUOTA) {
-		if (((type == XQM_USRQUOTA && current->euid != id) ||
+		if (((type == XQM_USRQUOTA && current_euid() != id) ||
 		     (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
 		     !capable(CAP_SYS_ADMIN))
 			return -EPERM;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0dcdd9458f4..b3d404aaabe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1193,7 +1193,7 @@ enum {
 #define has_fs_excl() atomic_read(&current->fs_excl)
 
 #define is_owner_or_cap(inode)	\
-	((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER))
+	((current_fsuid() == (inode)->i_uid) || capable(CAP_FOWNER))
 
 /* not quite ready to be deprecated, but... */
 extern void lock_super(struct super_block *);
-- 
cgit v1.2.3-70-g09d2


From e9e349b051d98799b743ebf248cc2d986fedf090 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:13 +1100
Subject: KEYS: Disperse linux/key_ui.h

Disperse the bits of linux/key_ui.h as the reason they were put here (keyfs)
didn't get in.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/keys/keyring-type.h | 31 +++++++++++++++++++++
 include/linux/key-ui.h      | 66 ---------------------------------------------
 security/keys/internal.h    | 31 ++++++++++++++++++++-
 security/keys/keyring.c     |  1 +
 security/keys/request_key.c |  2 ++
 5 files changed, 64 insertions(+), 67 deletions(-)
 create mode 100644 include/keys/keyring-type.h
 delete mode 100644 include/linux/key-ui.h

(limited to 'include/linux')

diff --git a/include/keys/keyring-type.h b/include/keys/keyring-type.h
new file mode 100644
index 00000000000..843f872a4b6
--- /dev/null
+++ b/include/keys/keyring-type.h
@@ -0,0 +1,31 @@
+/* Keyring key type
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _KEYS_KEYRING_TYPE_H
+#define _KEYS_KEYRING_TYPE_H
+
+#include <linux/key.h>
+#include <linux/rcupdate.h>
+
+/*
+ * the keyring payload contains a list of the keys to which the keyring is
+ * subscribed
+ */
+struct keyring_list {
+	struct rcu_head	rcu;		/* RCU deletion hook */
+	unsigned short	maxkeys;	/* max keys this list can hold */
+	unsigned short	nkeys;		/* number of keys currently held */
+	unsigned short	delkey;		/* key to be unlinked by RCU */
+	struct key	*keys[0];
+};
+
+
+#endif /* _KEYS_KEYRING_TYPE_H */
diff --git a/include/linux/key-ui.h b/include/linux/key-ui.h
deleted file mode 100644
index e8b8a7a5c49..00000000000
--- a/include/linux/key-ui.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* key-ui.h: key userspace interface stuff
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_KEY_UI_H
-#define _LINUX_KEY_UI_H
-
-#include <linux/key.h>
-
-/* the key tree */
-extern struct rb_root key_serial_tree;
-extern spinlock_t key_serial_lock;
-
-/* required permissions */
-#define	KEY_VIEW	0x01	/* require permission to view attributes */
-#define	KEY_READ	0x02	/* require permission to read content */
-#define	KEY_WRITE	0x04	/* require permission to update / modify */
-#define	KEY_SEARCH	0x08	/* require permission to search (keyring) or find (key) */
-#define	KEY_LINK	0x10	/* require permission to link */
-#define	KEY_SETATTR	0x20	/* require permission to change attributes */
-#define	KEY_ALL		0x3f	/* all the above permissions */
-
-/*
- * the keyring payload contains a list of the keys to which the keyring is
- * subscribed
- */
-struct keyring_list {
-	struct rcu_head	rcu;		/* RCU deletion hook */
-	unsigned short	maxkeys;	/* max keys this list can hold */
-	unsigned short	nkeys;		/* number of keys currently held */
-	unsigned short	delkey;		/* key to be unlinked by RCU */
-	struct key	*keys[0];
-};
-
-/*
- * check to see whether permission is granted to use a key in the desired way
- */
-extern int key_task_permission(const key_ref_t key_ref,
-			       struct task_struct *context,
-			       key_perm_t perm);
-
-static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
-{
-	return key_task_permission(key_ref, current, perm);
-}
-
-extern key_ref_t lookup_user_key(struct task_struct *context,
-				 key_serial_t id, int create, int partial,
-				 key_perm_t perm);
-
-extern long join_session_keyring(const char *name);
-
-extern struct key_type *key_type_lookup(const char *type);
-extern void key_type_put(struct key_type *ktype);
-
-#define key_negative_timeout	60	/* default timeout on a negative key's existence */
-
-
-#endif /* _LINUX_KEY_UI_H */
diff --git a/security/keys/internal.h b/security/keys/internal.h
index b39f5c2e2c4..a60c68138b4 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -13,7 +13,6 @@
 #define _INTERNAL_H
 
 #include <linux/key-type.h>
-#include <linux/key-ui.h>
 
 static inline __attribute__((format(printf, 1, 2)))
 void no_printk(const char *fmt, ...)
@@ -82,6 +81,9 @@ extern struct mutex key_construction_mutex;
 extern wait_queue_head_t request_key_conswq;
 
 
+extern struct key_type *key_type_lookup(const char *type);
+extern void key_type_put(struct key_type *ktype);
+
 extern int __key_link(struct key *keyring, struct key *key);
 
 extern key_ref_t __keyring_search_one(key_ref_t keyring_ref,
@@ -118,6 +120,33 @@ extern struct key *request_key_and_link(struct key_type *type,
 					struct key *dest_keyring,
 					unsigned long flags);
 
+extern key_ref_t lookup_user_key(struct task_struct *context,
+				 key_serial_t id, int create, int partial,
+				 key_perm_t perm);
+
+extern long join_session_keyring(const char *name);
+
+/*
+ * check to see whether permission is granted to use a key in the desired way
+ */
+extern int key_task_permission(const key_ref_t key_ref,
+			       struct task_struct *context,
+			       key_perm_t perm);
+
+static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
+{
+	return key_task_permission(key_ref, current, perm);
+}
+
+/* required permissions */
+#define	KEY_VIEW	0x01	/* require permission to view attributes */
+#define	KEY_READ	0x02	/* require permission to read content */
+#define	KEY_WRITE	0x04	/* require permission to update / modify */
+#define	KEY_SEARCH	0x08	/* require permission to search (keyring) or find (key) */
+#define	KEY_LINK	0x10	/* require permission to link */
+#define	KEY_SETATTR	0x20	/* require permission to change attributes */
+#define	KEY_ALL		0x3f	/* all the above permissions */
+
 /*
  * request_key authorisation
  */
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index a9ab8affc09..fdf75f90199 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/err.h>
+#include <keys/keyring-type.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index a8ebc9520ca..91953c81449 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -19,6 +19,8 @@
 #include <linux/slab.h>
 #include "internal.h"
 
+#define key_negative_timeout	60	/* default timeout on a negative key's existence */
+
 /*
  * wait_on_bit() sleep function for uninterruptible waiting
  */
-- 
cgit v1.2.3-70-g09d2


From 8bbf4976b59fc9fc2861e79cab7beb3f6d647640 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:14 +1100
Subject: KEYS: Alter use of key instantiation link-to-keyring argument

Alter the use of the key instantiation and negation functions' link-to-keyring
arguments.  Currently this specifies a keyring in the target process to link
the key into, creating the keyring if it doesn't exist.  This, however, can be
a problem for copy-on-write credentials as it means that the instantiating
process can alter the credentials of the requesting process.

This patch alters the behaviour such that:

 (1) If keyctl_instantiate_key() or keyctl_negate_key() are given a specific
     keyring by ID (ringid >= 0), then that keyring will be used.

 (2) If keyctl_instantiate_key() or keyctl_negate_key() are given one of the
     special constants that refer to the requesting process's keyrings
     (KEY_SPEC_*_KEYRING, all <= 0), then:

     (a) If sys_request_key() was given a keyring to use (destringid) then the
     	 key will be attached to that keyring.

     (b) If sys_request_key() was given a NULL keyring, then the key being
     	 instantiated will be attached to the default keyring as set by
     	 keyctl_set_reqkey_keyring().

 (3) No extra link will be made.

Decision point (1) follows current behaviour, and allows those instantiators
who've searched for a specifically named keyring in the requestor's keyring so
as to partition the keys by type to still have their named keyrings.

Decision point (2) allows the requestor to make sure that the key or keys that
get produced by request_key() go where they want, whilst allowing the
instantiator to request that the key is retained.  This is mainly useful for
situations where the instantiator makes a secondary request, the key for which
should be retained by the initial requestor:

	+-----------+        +--------------+        +--------------+
	|           |        |              |        |              |
	| Requestor |------->| Instantiator |------->| Instantiator |
	|           |        |              |        |              |
	+-----------+        +--------------+        +--------------+
	           request_key()           request_key()

This might be useful, for example, in Kerberos, where the requestor requests a
ticket, and then the ticket instantiator requests the TGT, which someone else
then has to go and fetch.  The TGT, however, should be retained in the
keyrings of the requestor, not the first instantiator.  To make this explict
an extra special keyring constant is also added.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/key.h              |  16 +++---
 include/linux/keyctl.h           |   4 +-
 kernel/kmod.c                    |   2 +-
 security/keys/internal.h         |  12 ++--
 security/keys/keyctl.c           | 118 +++++++++++++++++++++++----------------
 security/keys/process_keys.c     |  88 ++++++++++++++++++-----------
 security/keys/request_key.c      |  75 +++++++++++++++++--------
 security/keys/request_key_auth.c |   5 +-
 8 files changed, 199 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 1b70e35a71e..df709e1af3c 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -287,11 +287,11 @@ extern void key_fsuid_changed(struct task_struct *tsk);
 extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
 
-#define __install_session_keyring(tsk, keyring)			\
-({								\
-	struct key *old_session = tsk->signal->session_keyring;	\
-	tsk->signal->session_keyring = keyring;			\
-	old_session;						\
+#define __install_session_keyring(keyring)				\
+({									\
+	struct key *old_session = current->signal->session_keyring;	\
+	current->signal->session_keyring = keyring;			\
+	old_session;							\
 })
 
 #else /* CONFIG_KEYS */
@@ -302,11 +302,11 @@ extern void key_init(void);
 #define key_revoke(k)			do { } while(0)
 #define key_put(k)			do { } while(0)
 #define key_ref_put(k)			do { } while(0)
-#define make_key_ref(k, p)			({ NULL; })
-#define key_ref_to_ptr(k)		({ NULL; })
+#define make_key_ref(k, p)		NULL
+#define key_ref_to_ptr(k)		NULL
 #define is_key_possessed(k)		0
 #define switch_uid_keyring(u)		do { } while(0)
-#define __install_session_keyring(t, k)	({ NULL; })
+#define __install_session_keyring(k)	({ NULL; })
 #define copy_keys(f,t)			0
 #define copy_thread_group_keys(t)	0
 #define exit_keys(t)			do { } while(0)
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index 656ee6b77a4..c0688eb7209 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -1,6 +1,6 @@
 /* keyctl.h: keyctl command IDs
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004, 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -20,6 +20,7 @@
 #define KEY_SPEC_USER_SESSION_KEYRING	-5	/* - key ID for UID-session keyring */
 #define KEY_SPEC_GROUP_KEYRING		-6	/* - key ID for GID-specific keyring */
 #define KEY_SPEC_REQKEY_AUTH_KEY	-7	/* - key ID for assumed request_key auth key */
+#define KEY_SPEC_REQUESTOR_KEYRING	-8	/* - key ID for request_key() dest keyring */
 
 /* request-key default keyrings */
 #define KEY_REQKEY_DEFL_NO_CHANGE		-1
@@ -30,6 +31,7 @@
 #define KEY_REQKEY_DEFL_USER_KEYRING		4
 #define KEY_REQKEY_DEFL_USER_SESSION_KEYRING	5
 #define KEY_REQKEY_DEFL_GROUP_KEYRING		6
+#define KEY_REQKEY_DEFL_REQUESTOR_KEYRING	7
 
 /* keyctl commands */
 #define KEYCTL_GET_KEYRING_ID		0	/* ask for a keyring's ID */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3d3c3ea3a02..f044f8f5770 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -140,7 +140,7 @@ static int ____call_usermodehelper(void *data)
 	/* Unblock all signals and set the session keyring. */
 	new_session = key_get(sub_info->ring);
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(current, new_session);
+	old_session = __install_session_keyring(new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
diff --git a/security/keys/internal.h b/security/keys/internal.h
index a60c68138b4..d1586c62978 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -109,8 +109,9 @@ extern key_ref_t search_process_keyrings(struct key_type *type,
 
 extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check);
 
-extern int install_thread_keyring(struct task_struct *tsk);
-extern int install_process_keyring(struct task_struct *tsk);
+extern int install_user_keyrings(void);
+extern int install_thread_keyring(void);
+extern int install_process_keyring(void);
 
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
@@ -120,8 +121,7 @@ extern struct key *request_key_and_link(struct key_type *type,
 					struct key *dest_keyring,
 					unsigned long flags);
 
-extern key_ref_t lookup_user_key(struct task_struct *context,
-				 key_serial_t id, int create, int partial,
+extern key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 				 key_perm_t perm);
 
 extern long join_session_keyring(const char *name);
@@ -152,6 +152,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
  */
 struct request_key_auth {
 	struct key		*target_key;
+	struct key		*dest_keyring;
 	struct task_struct	*context;
 	void			*callout_info;
 	size_t			callout_len;
@@ -161,7 +162,8 @@ struct request_key_auth {
 extern struct key_type key_type_request_key_auth;
 extern struct key *request_key_auth_new(struct key *target,
 					const void *callout_info,
-					size_t callout_len);
+					size_t callout_len,
+					struct key *dest_keyring);
 
 extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
 
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 3f09e5b2a78..fcce331eca7 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -103,7 +103,7 @@ asmlinkage long sys_add_key(const char __user *_type,
 	}
 
 	/* find the target keyring (which must be writable) */
-	keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
+	keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 		goto error3;
@@ -185,7 +185,7 @@ asmlinkage long sys_request_key(const char __user *_type,
 	/* get the destination keyring if specified */
 	dest_ref = NULL;
 	if (destringid) {
-		dest_ref = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE);
+		dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(dest_ref)) {
 			ret = PTR_ERR(dest_ref);
 			goto error3;
@@ -235,7 +235,7 @@ long keyctl_get_keyring_ID(key_serial_t id, int create)
 	key_ref_t key_ref;
 	long ret;
 
-	key_ref = lookup_user_key(NULL, id, create, 0, KEY_SEARCH);
+	key_ref = lookup_user_key(id, create, 0, KEY_SEARCH);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -308,7 +308,7 @@ long keyctl_update_key(key_serial_t id,
 	}
 
 	/* find the target key (which must be writable) */
-	key_ref = lookup_user_key(NULL, id, 0, 0, KEY_WRITE);
+	key_ref = lookup_user_key(id, 0, 0, KEY_WRITE);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error2;
@@ -336,7 +336,7 @@ long keyctl_revoke_key(key_serial_t id)
 	key_ref_t key_ref;
 	long ret;
 
-	key_ref = lookup_user_key(NULL, id, 0, 0, KEY_WRITE);
+	key_ref = lookup_user_key(id, 0, 0, KEY_WRITE);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -362,7 +362,7 @@ long keyctl_keyring_clear(key_serial_t ringid)
 	key_ref_t keyring_ref;
 	long ret;
 
-	keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
+	keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 		goto error;
@@ -388,13 +388,13 @@ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid)
 	key_ref_t keyring_ref, key_ref;
 	long ret;
 
-	keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
+	keyring_ref = lookup_user_key(ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 		goto error;
 	}
 
-	key_ref = lookup_user_key(NULL, id, 1, 0, KEY_LINK);
+	key_ref = lookup_user_key(id, 1, 0, KEY_LINK);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error2;
@@ -422,13 +422,13 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid)
 	key_ref_t keyring_ref, key_ref;
 	long ret;
 
-	keyring_ref = lookup_user_key(NULL, ringid, 0, 0, KEY_WRITE);
+	keyring_ref = lookup_user_key(ringid, 0, 0, KEY_WRITE);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 		goto error;
 	}
 
-	key_ref = lookup_user_key(NULL, id, 0, 0, 0);
+	key_ref = lookup_user_key(id, 0, 0, 0);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error2;
@@ -464,7 +464,7 @@ long keyctl_describe_key(key_serial_t keyid,
 	char *tmpbuf;
 	long ret;
 
-	key_ref = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW);
+	key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW);
 	if (IS_ERR(key_ref)) {
 		/* viewing a key under construction is permitted if we have the
 		 * authorisation token handy */
@@ -472,7 +472,7 @@ long keyctl_describe_key(key_serial_t keyid,
 			instkey = key_get_instantiation_authkey(keyid);
 			if (!IS_ERR(instkey)) {
 				key_put(instkey);
-				key_ref = lookup_user_key(NULL, keyid,
+				key_ref = lookup_user_key(keyid,
 							  0, 1, 0);
 				if (!IS_ERR(key_ref))
 					goto okay;
@@ -557,7 +557,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 	}
 
 	/* get the keyring at which to begin the search */
-	keyring_ref = lookup_user_key(NULL, ringid, 0, 0, KEY_SEARCH);
+	keyring_ref = lookup_user_key(ringid, 0, 0, KEY_SEARCH);
 	if (IS_ERR(keyring_ref)) {
 		ret = PTR_ERR(keyring_ref);
 		goto error2;
@@ -566,7 +566,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 	/* get the destination keyring if specified */
 	dest_ref = NULL;
 	if (destringid) {
-		dest_ref = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE);
+		dest_ref = lookup_user_key(destringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(dest_ref)) {
 			ret = PTR_ERR(dest_ref);
 			goto error3;
@@ -636,7 +636,7 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen)
 	long ret;
 
 	/* find the key first */
-	key_ref = lookup_user_key(NULL, keyid, 0, 0, 0);
+	key_ref = lookup_user_key(keyid, 0, 0, 0);
 	if (IS_ERR(key_ref)) {
 		ret = -ENOKEY;
 		goto error;
@@ -699,7 +699,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	if (uid == (uid_t) -1 && gid == (gid_t) -1)
 		goto error;
 
-	key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR);
+	key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -804,7 +804,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 	if (perm & ~(KEY_POS_ALL | KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL))
 		goto error;
 
-	key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR);
+	key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -829,6 +829,43 @@ error:
 
 } /* end keyctl_setperm_key() */
 
+/*
+ * get the destination keyring for instantiation
+ */
+static long get_instantiation_keyring(key_serial_t ringid,
+				      struct request_key_auth *rka,
+				      struct key **_dest_keyring)
+{
+	key_ref_t dkref;
+
+	/* just return a NULL pointer if we weren't asked to make a link */
+	if (ringid == 0) {
+		*_dest_keyring = NULL;
+		return 0;
+	}
+
+	/* if a specific keyring is nominated by ID, then use that */
+	if (ringid > 0) {
+		dkref = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+		if (IS_ERR(dkref))
+			return PTR_ERR(dkref);
+		*_dest_keyring = key_ref_to_ptr(dkref);
+		return 0;
+	}
+
+	if (ringid == KEY_SPEC_REQKEY_AUTH_KEY)
+		return -EINVAL;
+
+	/* otherwise specify the destination keyring recorded in the
+	 * authorisation key (any KEY_SPEC_*_KEYRING) */
+	if (ringid >= KEY_SPEC_REQUESTOR_KEYRING) {
+		*_dest_keyring = rka->dest_keyring;
+		return 0;
+	}
+
+	return -ENOKEY;
+}
+
 /*****************************************************************************/
 /*
  * instantiate the key with the specified payload, and, if one is given, link
@@ -840,8 +877,7 @@ long keyctl_instantiate_key(key_serial_t id,
 			    key_serial_t ringid)
 {
 	struct request_key_auth *rka;
-	struct key *instkey;
-	key_ref_t keyring_ref;
+	struct key *instkey, *dest_keyring;
 	void *payload;
 	long ret;
 	bool vm = false;
@@ -883,21 +919,15 @@ long keyctl_instantiate_key(key_serial_t id,
 
 	/* find the destination keyring amongst those belonging to the
 	 * requesting task */
-	keyring_ref = NULL;
-	if (ringid) {
-		keyring_ref = lookup_user_key(rka->context, ringid, 1, 0,
-					      KEY_WRITE);
-		if (IS_ERR(keyring_ref)) {
-			ret = PTR_ERR(keyring_ref);
-			goto error2;
-		}
-	}
+	ret = get_instantiation_keyring(ringid, rka, &dest_keyring);
+	if (ret < 0)
+		goto error2;
 
 	/* instantiate the key and link it into a keyring */
 	ret = key_instantiate_and_link(rka->target_key, payload, plen,
-				       key_ref_to_ptr(keyring_ref), instkey);
+				       dest_keyring, instkey);
 
-	key_ref_put(keyring_ref);
+	key_put(dest_keyring);
 
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
@@ -924,8 +954,7 @@ error:
 long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 {
 	struct request_key_auth *rka;
-	struct key *instkey;
-	key_ref_t keyring_ref;
+	struct key *instkey, *dest_keyring;
 	long ret;
 
 	/* the appropriate instantiation authorisation key must have been
@@ -941,20 +970,15 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 
 	/* find the destination keyring if present (which must also be
 	 * writable) */
-	keyring_ref = NULL;
-	if (ringid) {
-		keyring_ref = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
-		if (IS_ERR(keyring_ref)) {
-			ret = PTR_ERR(keyring_ref);
-			goto error;
-		}
-	}
+	ret = get_instantiation_keyring(ringid, rka, &dest_keyring);
+	if (ret < 0)
+		goto error;
 
 	/* instantiate the key and link it into a keyring */
 	ret = key_negate_and_link(rka->target_key, timeout,
-				  key_ref_to_ptr(keyring_ref), instkey);
+				  dest_keyring, instkey);
 
-	key_ref_put(keyring_ref);
+	key_put(dest_keyring);
 
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
@@ -979,13 +1003,13 @@ long keyctl_set_reqkey_keyring(int reqkey_defl)
 
 	switch (reqkey_defl) {
 	case KEY_REQKEY_DEFL_THREAD_KEYRING:
-		ret = install_thread_keyring(current);
+		ret = install_thread_keyring();
 		if (ret < 0)
 			return ret;
 		goto set;
 
 	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
-		ret = install_process_keyring(current);
+		ret = install_process_keyring();
 		if (ret < 0)
 			return ret;
 
@@ -1018,7 +1042,7 @@ long keyctl_set_timeout(key_serial_t id, unsigned timeout)
 	time_t expiry;
 	long ret;
 
-	key_ref = lookup_user_key(NULL, id, 1, 1, KEY_SETATTR);
+	key_ref = lookup_user_key(id, 1, 1, KEY_SETATTR);
 	if (IS_ERR(key_ref)) {
 		ret = PTR_ERR(key_ref);
 		goto error;
@@ -1105,7 +1129,7 @@ long keyctl_get_security(key_serial_t keyid,
 	char *context;
 	long ret;
 
-	key_ref = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW);
+	key_ref = lookup_user_key(keyid, 0, 1, KEY_VIEW);
 	if (IS_ERR(key_ref)) {
 		if (PTR_ERR(key_ref) != -EACCES)
 			return PTR_ERR(key_ref);
@@ -1117,7 +1141,7 @@ long keyctl_get_security(key_serial_t keyid,
 			return PTR_ERR(key_ref);
 		key_put(instkey);
 
-		key_ref = lookup_user_key(NULL, keyid, 0, 1, 0);
+		key_ref = lookup_user_key(keyid, 0, 1, 0);
 		if (IS_ERR(key_ref))
 			return PTR_ERR(key_ref);
 	}
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 5be6d018759..1c793b7090a 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -40,9 +40,9 @@ struct key_user root_key_user = {
 /*
  * install user and user session keyrings for a particular UID
  */
-static int install_user_keyrings(struct task_struct *tsk)
+int install_user_keyrings(void)
 {
-	struct user_struct *user = tsk->user;
+	struct user_struct *user = current->user;
 	struct key *uid_keyring, *session_keyring;
 	char buf[20];
 	int ret;
@@ -67,7 +67,7 @@ static int install_user_keyrings(struct task_struct *tsk)
 		uid_keyring = find_keyring_by_name(buf, true);
 		if (IS_ERR(uid_keyring)) {
 			uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1,
-						    tsk, KEY_ALLOC_IN_QUOTA,
+						    current, KEY_ALLOC_IN_QUOTA,
 						    NULL);
 			if (IS_ERR(uid_keyring)) {
 				ret = PTR_ERR(uid_keyring);
@@ -83,7 +83,8 @@ static int install_user_keyrings(struct task_struct *tsk)
 		if (IS_ERR(session_keyring)) {
 			session_keyring =
 				keyring_alloc(buf, user->uid, (gid_t) -1,
-					      tsk, KEY_ALLOC_IN_QUOTA, NULL);
+					      current, KEY_ALLOC_IN_QUOTA,
+					      NULL);
 			if (IS_ERR(session_keyring)) {
 				ret = PTR_ERR(session_keyring);
 				goto error_release;
@@ -146,8 +147,9 @@ void switch_uid_keyring(struct user_struct *new_user)
 /*
  * install a fresh thread keyring, discarding the old one
  */
-int install_thread_keyring(struct task_struct *tsk)
+int install_thread_keyring(void)
 {
+	struct task_struct *tsk = current;
 	struct key *keyring, *old;
 	char buf[20];
 	int ret;
@@ -178,8 +180,9 @@ error:
 /*
  * make sure a process keyring is installed
  */
-int install_process_keyring(struct task_struct *tsk)
+int install_process_keyring(void)
 {
+	struct task_struct *tsk = current;
 	struct key *keyring;
 	char buf[20];
 	int ret;
@@ -218,9 +221,9 @@ error:
  * install a session keyring, discarding the old one
  * - if a keyring is not supplied, an empty one is invented
  */
-static int install_session_keyring(struct task_struct *tsk,
-				   struct key *keyring)
+static int install_session_keyring(struct key *keyring)
 {
+	struct task_struct *tsk = current;
 	unsigned long flags;
 	struct key *old;
 	char buf[20];
@@ -572,93 +575,91 @@ static int lookup_user_key_possessed(const struct key *key, const void *target)
  * - don't create special keyrings unless so requested
  * - partially constructed keys aren't found unless requested
  */
-key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
-			  int create, int partial, key_perm_t perm)
+key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
+			  key_perm_t perm)
 {
+	struct request_key_auth *rka;
+	struct task_struct *t = current;
 	key_ref_t key_ref, skey_ref;
 	struct key *key;
 	int ret;
 
-	if (!context)
-		context = current;
-
 	key_ref = ERR_PTR(-ENOKEY);
 
 	switch (id) {
 	case KEY_SPEC_THREAD_KEYRING:
-		if (!context->thread_keyring) {
+		if (!t->thread_keyring) {
 			if (!create)
 				goto error;
 
-			ret = install_thread_keyring(context);
+			ret = install_thread_keyring();
 			if (ret < 0) {
 				key = ERR_PTR(ret);
 				goto error;
 			}
 		}
 
-		key = context->thread_keyring;
+		key = t->thread_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_PROCESS_KEYRING:
-		if (!context->signal->process_keyring) {
+		if (!t->signal->process_keyring) {
 			if (!create)
 				goto error;
 
-			ret = install_process_keyring(context);
+			ret = install_process_keyring();
 			if (ret < 0) {
 				key = ERR_PTR(ret);
 				goto error;
 			}
 		}
 
-		key = context->signal->process_keyring;
+		key = t->signal->process_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_SESSION_KEYRING:
-		if (!context->signal->session_keyring) {
+		if (!t->signal->session_keyring) {
 			/* always install a session keyring upon access if one
 			 * doesn't exist yet */
-			ret = install_user_keyrings(context);
+			ret = install_user_keyrings();
 			if (ret < 0)
 				goto error;
-			ret = install_session_keyring(
-				context, context->user->session_keyring);
+			ret = install_session_keyring(t->user->session_keyring);
 			if (ret < 0)
 				goto error;
 		}
 
 		rcu_read_lock();
-		key = rcu_dereference(context->signal->session_keyring);
+		key = rcu_dereference(t->signal->session_keyring);
 		atomic_inc(&key->usage);
 		rcu_read_unlock();
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_USER_KEYRING:
-		if (!context->user->uid_keyring) {
-			ret = install_user_keyrings(context);
+		if (!t->user->uid_keyring) {
+			ret = install_user_keyrings();
 			if (ret < 0)
 				goto error;
 		}
 
-		key = context->user->uid_keyring;
+		key = t->user->uid_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_USER_SESSION_KEYRING:
-		if (!context->user->session_keyring) {
-			ret = install_user_keyrings(context);
+		if (!t->user->session_keyring) {
+			ret = install_user_keyrings();
 			if (ret < 0)
 				goto error;
 		}
 
-		key = context->user->session_keyring;
+		key = t->user->session_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
@@ -669,7 +670,7 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		goto error;
 
 	case KEY_SPEC_REQKEY_AUTH_KEY:
-		key = context->request_key_auth;
+		key = t->request_key_auth;
 		if (!key)
 			goto error;
 
@@ -677,6 +678,25 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		key_ref = make_key_ref(key, 1);
 		break;
 
+	case KEY_SPEC_REQUESTOR_KEYRING:
+		if (!t->request_key_auth)
+			goto error;
+
+		down_read(&t->request_key_auth->sem);
+		if (t->request_key_auth->flags & KEY_FLAG_REVOKED) {
+			key_ref = ERR_PTR(-EKEYREVOKED);
+			key = NULL;
+		} else {
+			rka = t->request_key_auth->payload.data;
+			key = rka->dest_keyring;
+			atomic_inc(&key->usage);
+		}
+		up_read(&t->request_key_auth->sem);
+		if (!key)
+			goto error;
+		key_ref = make_key_ref(key, 1);
+		break;
+
 	default:
 		key_ref = ERR_PTR(-EINVAL);
 		if (id < 1)
@@ -725,7 +745,7 @@ key_ref_t lookup_user_key(struct task_struct *context, key_serial_t id,
 		goto invalid_key;
 
 	/* check the permissions */
-	ret = key_task_permission(key_ref, context, perm);
+	ret = key_task_permission(key_ref, t, perm);
 	if (ret < 0)
 		goto invalid_key;
 
@@ -754,7 +774,7 @@ long join_session_keyring(const char *name)
 
 	/* if no name is provided, install an anonymous keyring */
 	if (!name) {
-		ret = install_session_keyring(tsk, NULL);
+		ret = install_session_keyring(NULL);
 		if (ret < 0)
 			goto error;
 
@@ -784,7 +804,7 @@ long join_session_keyring(const char *name)
 	}
 
 	/* we've got a keyring - now to install it */
-	ret = install_session_keyring(tsk, keyring);
+	ret = install_session_keyring(keyring);
 	if (ret < 0)
 		goto error2;
 
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 91953c81449..8e9d93b4a40 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -76,6 +76,10 @@ static int call_sbin_request_key(struct key_construction *cons,
 
 	kenter("{%d},{%d},%s", key->serial, authkey->serial, op);
 
+	ret = install_user_keyrings();
+	if (ret < 0)
+		goto error_alloc;
+
 	/* allocate a new session keyring */
 	sprintf(desc, "_req.%u", key->serial);
 
@@ -165,7 +169,8 @@ error_alloc:
  * - we ignore program failure and go on key status instead
  */
 static int construct_key(struct key *key, const void *callout_info,
-			 size_t callout_len, void *aux)
+			 size_t callout_len, void *aux,
+			 struct key *dest_keyring)
 {
 	struct key_construction *cons;
 	request_key_actor_t actor;
@@ -179,7 +184,8 @@ static int construct_key(struct key *key, const void *callout_info,
 		return -ENOMEM;
 
 	/* allocate an authorisation key */
-	authkey = request_key_auth_new(key, callout_info, callout_len);
+	authkey = request_key_auth_new(key, callout_info, callout_len,
+				       dest_keyring);
 	if (IS_ERR(authkey)) {
 		kfree(cons);
 		ret = PTR_ERR(authkey);
@@ -207,27 +213,48 @@ static int construct_key(struct key *key, const void *callout_info,
 }
 
 /*
- * link a key to the appropriate destination keyring
- * - the caller must hold a write lock on the destination keyring
+ * get the appropriate destination keyring for the request
+ * - we return whatever keyring we select with an extra reference upon it which
+ *   the caller must release
  */
-static void construct_key_make_link(struct key *key, struct key *dest_keyring)
+static void construct_get_dest_keyring(struct key **_dest_keyring)
 {
+	struct request_key_auth *rka;
 	struct task_struct *tsk = current;
-	struct key *drop = NULL;
+	struct key *dest_keyring = *_dest_keyring, *authkey;
 
-	kenter("{%d},%p", key->serial, dest_keyring);
+	kenter("%p", dest_keyring);
 
 	/* find the appropriate keyring */
-	if (!dest_keyring) {
+	if (dest_keyring) {
+		/* the caller supplied one */
+		key_get(dest_keyring);
+	} else {
+		/* use a default keyring; falling through the cases until we
+		 * find one that we actually have */
 		switch (tsk->jit_keyring) {
 		case KEY_REQKEY_DEFL_DEFAULT:
+		case KEY_REQKEY_DEFL_REQUESTOR_KEYRING:
+			if (tsk->request_key_auth) {
+				authkey = tsk->request_key_auth;
+				down_read(&authkey->sem);
+				rka = authkey->payload.data;
+				if (!test_bit(KEY_FLAG_REVOKED,
+					      &authkey->flags))
+					dest_keyring =
+						key_get(rka->dest_keyring);
+				up_read(&authkey->sem);
+				if (dest_keyring)
+					break;
+			}
+
 		case KEY_REQKEY_DEFL_THREAD_KEYRING:
-			dest_keyring = tsk->thread_keyring;
+			dest_keyring = key_get(tsk->thread_keyring);
 			if (dest_keyring)
 				break;
 
 		case KEY_REQKEY_DEFL_PROCESS_KEYRING:
-			dest_keyring = tsk->signal->process_keyring;
+			dest_keyring = key_get(tsk->signal->process_keyring);
 			if (dest_keyring)
 				break;
 
@@ -236,17 +263,16 @@ static void construct_key_make_link(struct key *key, struct key *dest_keyring)
 			dest_keyring = key_get(
 				rcu_dereference(tsk->signal->session_keyring));
 			rcu_read_unlock();
-			drop = dest_keyring;
 
 			if (dest_keyring)
 				break;
 
 		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
-			dest_keyring = tsk->user->session_keyring;
+			dest_keyring = key_get(tsk->user->session_keyring);
 			break;
 
 		case KEY_REQKEY_DEFL_USER_KEYRING:
-			dest_keyring = tsk->user->uid_keyring;
+			dest_keyring = key_get(tsk->user->uid_keyring);
 			break;
 
 		case KEY_REQKEY_DEFL_GROUP_KEYRING:
@@ -255,10 +281,9 @@ static void construct_key_make_link(struct key *key, struct key *dest_keyring)
 		}
 	}
 
-	/* and attach the key to it */
-	__key_link(dest_keyring, key);
-	key_put(drop);
-	kleave("");
+	*_dest_keyring = dest_keyring;
+	kleave(" [dk %d]", key_serial(dest_keyring));
+	return;
 }
 
 /*
@@ -288,8 +313,7 @@ static int construct_alloc_key(struct key_type *type,
 
 	set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags);
 
-	if (dest_keyring)
-		down_write(&dest_keyring->sem);
+	down_write(&dest_keyring->sem);
 
 	/* attach the key to the destination keyring under lock, but we do need
 	 * to do another check just in case someone beat us to it whilst we
@@ -301,12 +325,10 @@ static int construct_alloc_key(struct key_type *type,
 	if (!IS_ERR(key_ref))
 		goto key_already_present;
 
-	if (dest_keyring)
-		construct_key_make_link(key, dest_keyring);
+	__key_link(dest_keyring, key);
 
 	mutex_unlock(&key_construction_mutex);
-	if (dest_keyring)
-		up_write(&dest_keyring->sem);
+	up_write(&dest_keyring->sem);
 	mutex_unlock(&user->cons_lock);
 	*_key = key;
 	kleave(" = 0 [%d]", key_serial(key));
@@ -348,21 +370,26 @@ static struct key *construct_key_and_link(struct key_type *type,
 	if (!user)
 		return ERR_PTR(-ENOMEM);
 
+	construct_get_dest_keyring(&dest_keyring);
+
 	ret = construct_alloc_key(type, description, dest_keyring, flags, user,
 				  &key);
 	key_user_put(user);
 
 	if (ret == 0) {
-		ret = construct_key(key, callout_info, callout_len, aux);
+		ret = construct_key(key, callout_info, callout_len, aux,
+				    dest_keyring);
 		if (ret < 0)
 			goto construction_failed;
 	}
 
+	key_put(dest_keyring);
 	return key;
 
 construction_failed:
 	key_negate_and_link(key, key_negative_timeout, NULL, NULL);
 	key_put(key);
+	key_put(dest_keyring);
 	return ERR_PTR(ret);
 }
 
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 729156b3485..1762d44711d 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -128,6 +128,7 @@ static void request_key_auth_destroy(struct key *key)
 	}
 
 	key_put(rka->target_key);
+	key_put(rka->dest_keyring);
 	kfree(rka->callout_info);
 	kfree(rka);
 
@@ -139,7 +140,7 @@ static void request_key_auth_destroy(struct key *key)
  * access to the caller's security data
  */
 struct key *request_key_auth_new(struct key *target, const void *callout_info,
-				 size_t callout_len)
+				 size_t callout_len, struct key *dest_keyring)
 {
 	struct request_key_auth *rka, *irka;
 	struct key *authkey = NULL;
@@ -188,6 +189,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 	}
 
 	rka->target_key = key_get(target);
+	rka->dest_keyring = key_get(dest_keyring);
 	memcpy(rka->callout_info, callout_info, callout_len);
 	rka->callout_len = callout_len;
 
@@ -223,6 +225,7 @@ error_inst:
 	key_put(authkey);
 error_alloc:
 	key_put(rka->target_key);
+	key_put(rka->dest_keyring);
 	kfree(rka->callout_info);
 	kfree(rka);
 	kleave("= %d", ret);
-- 
cgit v1.2.3-70-g09d2


From 1cdcbec1a3372c0c49c59d292e708fd07b509f18 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:14 +1100
Subject: CRED: Neuter sys_capset()

Take away the ability for sys_capset() to affect processes other than current.

This means that current will not need to lock its own credentials when reading
them against interference by other processes.

This has effectively been the case for a while anyway, since:

 (1) Without LSM enabled, sys_capset() is disallowed.

 (2) With file-based capabilities, sys_capset() is neutered.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/open.c                |  12 +--
 include/linux/security.h |  48 ++++------
 kernel/capability.c      | 227 +++++------------------------------------------
 security/commoncap.c     |  29 ++----
 security/security.c      |  18 ++--
 security/selinux/hooks.c |  10 +--
 6 files changed, 61 insertions(+), 283 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 83cdb9dee0c..500cc0c5476 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -441,17 +441,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 	current->fsgid = current->gid;
 
 	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
-		/*
-		 * Clear the capabilities if we switch to a non-root user
-		 */
-#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
-		/*
-		 * FIXME: There is a race here against sys_capset.  The
-		 * capabilities can change yet we will restore the old
-		 * value below.  We should hold task_capabilities_lock,
-		 * but we cannot because user_path_at can sleep.
-		 */
-#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+		/* Clear the capabilities if we switch to a non-root user */
 		if (current->uid)
 			old_cap = cap_set_effective(__cap_empty_set);
 		else
diff --git a/include/linux/security.h b/include/linux/security.h
index 5fe28a671cd..d1ce8beddbd 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,8 +53,8 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
+extern int cap_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
+extern void cap_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
 extern int cap_bprm_set_security(struct linux_binprm *bprm);
 extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
@@ -1191,24 +1191,14 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Return 0 if the capability sets were successfully obtained.
  * @capset_check:
  *	Check permission before setting the @effective, @inheritable, and
- *	@permitted capability sets for the @target process.
- *	Caveat:  @target is also set to current if a set of processes is
- *	specified (i.e. all processes other than current and init or a
- *	particular process group).  Hence, the capset_set hook may need to
- *	revalidate permission to the actual target process.
- *	@target contains the task_struct structure for target process.
+ *	@permitted capability sets for the current process.
  *	@effective contains the effective capability set.
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
  *	Return 0 if permission is granted.
  * @capset_set:
  *	Set the @effective, @inheritable, and @permitted capability sets for
- *	the @target process.  Since capset_check cannot always check permission
- *	to the real @target process, this hook may also perform permission
- *	checking to determine if the current process is allowed to set the
- *	capability sets of the @target process.  However, this hook has no way
- *	of returning an error due to the structure of the sys_capset code.
- *	@target contains the task_struct structure for target process.
+ *	the current process.
  *	@effective contains the effective capability set.
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
@@ -1303,12 +1293,10 @@ struct security_operations {
 	int (*capget) (struct task_struct *target,
 		       kernel_cap_t *effective,
 		       kernel_cap_t *inheritable, kernel_cap_t *permitted);
-	int (*capset_check) (struct task_struct *target,
-			     kernel_cap_t *effective,
+	int (*capset_check) (kernel_cap_t *effective,
 			     kernel_cap_t *inheritable,
 			     kernel_cap_t *permitted);
-	void (*capset_set) (struct task_struct *target,
-			    kernel_cap_t *effective,
+	void (*capset_set) (kernel_cap_t *effective,
 			    kernel_cap_t *inheritable,
 			    kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, int cap, int audit);
@@ -1572,12 +1560,10 @@ int security_capget(struct task_struct *target,
 		    kernel_cap_t *effective,
 		    kernel_cap_t *inheritable,
 		    kernel_cap_t *permitted);
-int security_capset_check(struct task_struct *target,
-			  kernel_cap_t *effective,
+int security_capset_check(kernel_cap_t *effective,
 			  kernel_cap_t *inheritable,
 			  kernel_cap_t *permitted);
-void security_capset_set(struct task_struct *target,
-			 kernel_cap_t *effective,
+void security_capset_set(kernel_cap_t *effective,
 			 kernel_cap_t *inheritable,
 			 kernel_cap_t *permitted);
 int security_capable(struct task_struct *tsk, int cap);
@@ -1769,20 +1755,18 @@ static inline int security_capget(struct task_struct *target,
 	return cap_capget(target, effective, inheritable, permitted);
 }
 
-static inline int security_capset_check(struct task_struct *target,
-					 kernel_cap_t *effective,
-					 kernel_cap_t *inheritable,
-					 kernel_cap_t *permitted)
+static inline int security_capset_check(kernel_cap_t *effective,
+					kernel_cap_t *inheritable,
+					kernel_cap_t *permitted)
 {
-	return cap_capset_check(target, effective, inheritable, permitted);
+	return cap_capset_check(effective, inheritable, permitted);
 }
 
-static inline void security_capset_set(struct task_struct *target,
-					kernel_cap_t *effective,
-					kernel_cap_t *inheritable,
-					kernel_cap_t *permitted)
+static inline void security_capset_set(kernel_cap_t *effective,
+				       kernel_cap_t *inheritable,
+				       kernel_cap_t *permitted)
 {
-	cap_capset_set(target, effective, inheritable, permitted);
+	cap_capset_set(effective, inheritable, permitted);
 }
 
 static inline int security_capable(struct task_struct *tsk, int cap)
diff --git a/kernel/capability.c b/kernel/capability.c
index adb262f83de..58b00519624 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -127,160 +127,6 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 	return 0;
 }
 
-#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
-
-/*
- * Without filesystem capability support, we nominally support one process
- * setting the capabilities of another
- */
-static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
-				     kernel_cap_t *pIp, kernel_cap_t *pPp)
-{
-	struct task_struct *target;
-	int ret;
-
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	if (pid && pid != task_pid_vnr(current)) {
-		target = find_task_by_vpid(pid);
-		if (!target) {
-			ret = -ESRCH;
-			goto out;
-		}
-	} else
-		target = current;
-
-	ret = security_capget(target, pEp, pIp, pPp);
-
-out:
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	return ret;
-}
-
-/*
- * cap_set_pg - set capabilities for all processes in a given process
- * group.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
-			     kernel_cap_t *inheritable,
-			     kernel_cap_t *permitted)
-{
-	struct task_struct *g, *target;
-	int ret = -EPERM;
-	int found = 0;
-	struct pid *pgrp;
-
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	pgrp = find_vpid(pgrp_nr);
-	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
-		target = g;
-		while_each_thread(g, target) {
-			if (!security_capset_check(target, effective,
-						   inheritable, permitted)) {
-				security_capset_set(target, effective,
-						    inheritable, permitted);
-				ret = 0;
-			}
-			found = 1;
-		}
-	} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
-
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	if (!found)
-		ret = 0;
-	return ret;
-}
-
-/*
- * cap_set_all - set capabilities for all processes other than init
- * and self.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_all(kernel_cap_t *effective,
-			      kernel_cap_t *inheritable,
-			      kernel_cap_t *permitted)
-{
-	struct task_struct *g, *target;
-	int ret = -EPERM;
-	int found = 0;
-
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	do_each_thread(g, target) {
-		if (target == current
-		    || is_container_init(target->group_leader))
-			continue;
-		found = 1;
-		if (security_capset_check(target, effective, inheritable,
-					  permitted))
-			continue;
-		ret = 0;
-		security_capset_set(target, effective, inheritable, permitted);
-	} while_each_thread(g, target);
-
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	if (!found)
-		ret = 0;
-
-	return ret;
-}
-
-/*
- * Given the target pid does not refer to the current process we
- * need more elaborate support... (This support is not present when
- * filesystem capabilities are configured.)
- */
-static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
-					    kernel_cap_t *inheritable,
-					    kernel_cap_t *permitted)
-{
-	struct task_struct *target;
-	int ret;
-
-	if (!capable(CAP_SETPCAP))
-		return -EPERM;
-
-	if (pid == -1)	          /* all procs other than current and init */
-		return cap_set_all(effective, inheritable, permitted);
-
-	else if (pid < 0)                    /* all procs in process group */
-		return cap_set_pg(-pid, effective, inheritable, permitted);
-
-	/* target != current */
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	target = find_task_by_vpid(pid);
-	if (!target)
-		ret = -ESRCH;
-	else {
-		ret = security_capset_check(target, effective, inheritable,
-					    permitted);
-
-		/* having verified that the proposed changes are legal,
-		   we now put them into effect. */
-		if (!ret)
-			security_capset_set(target, effective, inheritable,
-					    permitted);
-	}
-
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
-
-	return ret;
-}
-
-#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
-
 /*
  * If we have configured with filesystem capability support, then the
  * only thing that can change the capabilities of the current process
@@ -314,22 +160,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	return ret;
 }
 
-/*
- * With filesystem capability support configured, the kernel does not
- * permit the changing of capabilities in one process by another
- * process. (CAP_SETPCAP has much less broad semantics when configured
- * this way.)
- */
-static inline int do_sys_capset_other_tasks(pid_t pid,
-					    kernel_cap_t *effective,
-					    kernel_cap_t *inheritable,
-					    kernel_cap_t *permitted)
-{
-	return -EPERM;
-}
-
-#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
-
 /*
  * Atomically modify the effective capabilities returning the original
  * value. No permission check is performed here - it is assumed that the
@@ -424,16 +254,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
  * @data: pointer to struct that contains the effective, permitted,
  *	and inheritable capabilities
  *
- * Set capabilities for a given process, all processes, or all
- * processes in a given process group.
+ * Set capabilities for the current process only.  The ability to any other
+ * process(es) has been deprecated and removed.
  *
  * The restrictions on setting capabilities are specified as:
  *
- * [pid is for the 'target' task.  'current' is the calling task.]
- *
- * I: any raised capabilities must be a subset of the (old current) permitted
- * P: any raised capabilities must be a subset of the (old current) permitted
- * E: must be set to a subset of (new target) permitted
+ * I: any raised capabilities must be a subset of the old permitted
+ * P: any raised capabilities must be a subset of the old permitted
+ * E: must be set to a subset of new permitted
  *
  * Returns 0 on success and < 0 on error.
  */
@@ -452,10 +280,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
+	/* may only affect current now */
+	if (pid != 0 && pid != task_pid_vnr(current))
+		return -EPERM;
+
 	if (copy_from_user(&kdata, data, tocopy
-			   * sizeof(struct __user_cap_data_struct))) {
+			   * sizeof(struct __user_cap_data_struct)))
 		return -EFAULT;
-	}
 
 	for (i = 0; i < tocopy; i++) {
 		effective.cap[i] = kdata[i].effective;
@@ -473,32 +304,20 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (ret)
 		return ret;
 
-	if (pid && (pid != task_pid_vnr(current)))
-		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
-						&permitted);
-	else {
-		/*
-		 * This lock is required even when filesystem
-		 * capability support is configured - it protects the
-		 * sys_capget() call from returning incorrect data in
-		 * the case that the targeted process is not the
-		 * current one.
-		 */
-		spin_lock(&task_capability_lock);
-
-		ret = security_capset_check(current, &effective, &inheritable,
-					    &permitted);
-		/*
-		 * Having verified that the proposed changes are
-		 * legal, we now put them into effect.
-		 */
-		if (!ret)
-			security_capset_set(current, &effective, &inheritable,
-					    &permitted);
-		spin_unlock(&task_capability_lock);
-	}
-
+	/* This lock is required even when filesystem capability support is
+	 * configured - it protects the sys_capget() call from returning
+	 * incorrect data in the case that the targeted process is not the
+	 * current one.
+	 */
+	spin_lock(&task_capability_lock);
 
+	ret = security_capset_check(&effective, &inheritable, &permitted);
+	/* Having verified that the proposed changes are legal, we now put them
+	 * into effect.
+	 */
+	if (!ret)
+		security_capset_set(&effective, &inheritable, &permitted);
+	spin_unlock(&task_capability_lock);
 	return ret;
 }
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 8283271f076..e3f36ef629f 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -96,15 +96,6 @@ int cap_capget (struct task_struct *target, kernel_cap_t *effective,
 
 #ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 
-static inline int cap_block_setpcap(struct task_struct *target)
-{
-	/*
-	 * No support for remote process capability manipulation with
-	 * filesystem capability support.
-	 */
-	return (target != current);
-}
-
 static inline int cap_inh_is_capped(void)
 {
 	/*
@@ -119,7 +110,6 @@ static inline int cap_limit_ptraced_target(void) { return 1; }
 
 #else /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
 
-static inline int cap_block_setpcap(struct task_struct *t) { return 0; }
 static inline int cap_inh_is_capped(void) { return 1; }
 static inline int cap_limit_ptraced_target(void)
 {
@@ -128,21 +118,18 @@ static inline int cap_limit_ptraced_target(void)
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
 
-int cap_capset_check (struct task_struct *target, kernel_cap_t *effective,
+int cap_capset_check (kernel_cap_t *effective,
 		      kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
-	if (cap_block_setpcap(target)) {
-		return -EPERM;
-	}
 	if (cap_inh_is_capped()
 	    && !cap_issubset(*inheritable,
-			     cap_combine(target->cap_inheritable,
+			     cap_combine(current->cap_inheritable,
 					 current->cap_permitted))) {
 		/* incapable of using this inheritable set */
 		return -EPERM;
 	}
 	if (!cap_issubset(*inheritable,
-			   cap_combine(target->cap_inheritable,
+			   cap_combine(current->cap_inheritable,
 				       current->cap_bset))) {
 		/* no new pI capabilities outside bounding set */
 		return -EPERM;
@@ -150,7 +137,7 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective,
 
 	/* verify restrictions on target's new Permitted set */
 	if (!cap_issubset (*permitted,
-			   cap_combine (target->cap_permitted,
+			   cap_combine (current->cap_permitted,
 					current->cap_permitted))) {
 		return -EPERM;
 	}
@@ -163,12 +150,12 @@ int cap_capset_check (struct task_struct *target, kernel_cap_t *effective,
 	return 0;
 }
 
-void cap_capset_set (struct task_struct *target, kernel_cap_t *effective,
+void cap_capset_set (kernel_cap_t *effective,
 		     kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
-	target->cap_effective = *effective;
-	target->cap_inheritable = *inheritable;
-	target->cap_permitted = *permitted;
+	current->cap_effective = *effective;
+	current->cap_inheritable = *inheritable;
+	current->cap_permitted = *permitted;
 }
 
 static inline void bprm_clear_caps(struct linux_binprm *bprm)
diff --git a/security/security.c b/security/security.c
index 346f21e0ec2..dca37381e2a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -145,20 +145,18 @@ int security_capget(struct task_struct *target,
 	return security_ops->capget(target, effective, inheritable, permitted);
 }
 
-int security_capset_check(struct task_struct *target,
-			   kernel_cap_t *effective,
-			   kernel_cap_t *inheritable,
-			   kernel_cap_t *permitted)
+int security_capset_check(kernel_cap_t *effective,
+			  kernel_cap_t *inheritable,
+			  kernel_cap_t *permitted)
 {
-	return security_ops->capset_check(target, effective, inheritable, permitted);
+	return security_ops->capset_check(effective, inheritable, permitted);
 }
 
-void security_capset_set(struct task_struct *target,
-			  kernel_cap_t *effective,
-			  kernel_cap_t *inheritable,
-			  kernel_cap_t *permitted)
+void security_capset_set(kernel_cap_t *effective,
+			 kernel_cap_t *inheritable,
+			 kernel_cap_t *permitted)
 {
-	security_ops->capset_set(target, effective, inheritable, permitted);
+	security_ops->capset_set(effective, inheritable, permitted);
 }
 
 int security_capable(struct task_struct *tsk, int cap)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 378dc53c08e..df9986940e9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1790,22 +1790,22 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 	return secondary_ops->capget(target, effective, inheritable, permitted);
 }
 
-static int selinux_capset_check(struct task_struct *target, kernel_cap_t *effective,
+static int selinux_capset_check(kernel_cap_t *effective,
 				kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
 	int error;
 
-	error = secondary_ops->capset_check(target, effective, inheritable, permitted);
+	error = secondary_ops->capset_check(effective, inheritable, permitted);
 	if (error)
 		return error;
 
-	return task_has_perm(current, target, PROCESS__SETCAP);
+	return task_has_perm(current, current, PROCESS__SETCAP);
 }
 
-static void selinux_capset_set(struct task_struct *target, kernel_cap_t *effective,
+static void selinux_capset_set(kernel_cap_t *effective,
 			       kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
-	secondary_ops->capset_set(target, effective, inheritable, permitted);
+	secondary_ops->capset_set(effective, inheritable, permitted);
 }
 
 static int selinux_capable(struct task_struct *tsk, int cap, int audit)
-- 
cgit v1.2.3-70-g09d2


From 15a2460ed0af7538ca8e6c610fe607a2cd9da142 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:15 +1100
Subject: CRED: Constify the kernel_cap_t arguments to the capset LSM hooks

Constify the kernel_cap_t arguments to the capset LSM hooks.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/security.h | 44 ++++++++++++++++++++++++--------------------
 security/commoncap.c     | 10 ++++++----
 security/security.c      | 12 ++++++------
 security/selinux/hooks.c | 10 ++++++----
 4 files changed, 42 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index d1ce8beddbd..9f305d4a31a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,8 +53,12 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern int cap_capset_check(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern void cap_capset_set(kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
+extern int cap_capset_check(const kernel_cap_t *effective,
+			    const kernel_cap_t *inheritable,
+			    const kernel_cap_t *permitted);
+extern void cap_capset_set(const kernel_cap_t *effective,
+			   const kernel_cap_t *inheritable,
+			   const kernel_cap_t *permitted);
 extern int cap_bprm_set_security(struct linux_binprm *bprm);
 extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
@@ -1293,12 +1297,12 @@ struct security_operations {
 	int (*capget) (struct task_struct *target,
 		       kernel_cap_t *effective,
 		       kernel_cap_t *inheritable, kernel_cap_t *permitted);
-	int (*capset_check) (kernel_cap_t *effective,
-			     kernel_cap_t *inheritable,
-			     kernel_cap_t *permitted);
-	void (*capset_set) (kernel_cap_t *effective,
-			    kernel_cap_t *inheritable,
-			    kernel_cap_t *permitted);
+	int (*capset_check) (const kernel_cap_t *effective,
+			     const kernel_cap_t *inheritable,
+			     const kernel_cap_t *permitted);
+	void (*capset_set) (const kernel_cap_t *effective,
+			    const kernel_cap_t *inheritable,
+			    const kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, int cap, int audit);
 	int (*acct) (struct file *file);
 	int (*sysctl) (struct ctl_table *table, int op);
@@ -1560,12 +1564,12 @@ int security_capget(struct task_struct *target,
 		    kernel_cap_t *effective,
 		    kernel_cap_t *inheritable,
 		    kernel_cap_t *permitted);
-int security_capset_check(kernel_cap_t *effective,
-			  kernel_cap_t *inheritable,
-			  kernel_cap_t *permitted);
-void security_capset_set(kernel_cap_t *effective,
-			 kernel_cap_t *inheritable,
-			 kernel_cap_t *permitted);
+int security_capset_check(const kernel_cap_t *effective,
+			  const kernel_cap_t *inheritable,
+			  const kernel_cap_t *permitted);
+void security_capset_set(const kernel_cap_t *effective,
+			 const kernel_cap_t *inheritable,
+			 const kernel_cap_t *permitted);
 int security_capable(struct task_struct *tsk, int cap);
 int security_capable_noaudit(struct task_struct *tsk, int cap);
 int security_acct(struct file *file);
@@ -1755,16 +1759,16 @@ static inline int security_capget(struct task_struct *target,
 	return cap_capget(target, effective, inheritable, permitted);
 }
 
-static inline int security_capset_check(kernel_cap_t *effective,
-					kernel_cap_t *inheritable,
-					kernel_cap_t *permitted)
+static inline int security_capset_check(const kernel_cap_t *effective,
+					const kernel_cap_t *inheritable,
+					const kernel_cap_t *permitted)
 {
 	return cap_capset_check(effective, inheritable, permitted);
 }
 
-static inline void security_capset_set(kernel_cap_t *effective,
-				       kernel_cap_t *inheritable,
-				       kernel_cap_t *permitted)
+static inline void security_capset_set(const kernel_cap_t *effective,
+				       const kernel_cap_t *inheritable,
+				       const kernel_cap_t *permitted)
 {
 	cap_capset_set(effective, inheritable, permitted);
 }
diff --git a/security/commoncap.c b/security/commoncap.c
index e3f36ef629f..fb4e240720d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -118,8 +118,9 @@ static inline int cap_limit_ptraced_target(void)
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
 
-int cap_capset_check (kernel_cap_t *effective,
-		      kernel_cap_t *inheritable, kernel_cap_t *permitted)
+int cap_capset_check(const kernel_cap_t *effective,
+		     const kernel_cap_t *inheritable,
+		     const kernel_cap_t *permitted)
 {
 	if (cap_inh_is_capped()
 	    && !cap_issubset(*inheritable,
@@ -150,8 +151,9 @@ int cap_capset_check (kernel_cap_t *effective,
 	return 0;
 }
 
-void cap_capset_set (kernel_cap_t *effective,
-		     kernel_cap_t *inheritable, kernel_cap_t *permitted)
+void cap_capset_set(const kernel_cap_t *effective,
+		    const kernel_cap_t *inheritable,
+		    const kernel_cap_t *permitted)
 {
 	current->cap_effective = *effective;
 	current->cap_inheritable = *inheritable;
diff --git a/security/security.c b/security/security.c
index dca37381e2a..81c956a1230 100644
--- a/security/security.c
+++ b/security/security.c
@@ -145,16 +145,16 @@ int security_capget(struct task_struct *target,
 	return security_ops->capget(target, effective, inheritable, permitted);
 }
 
-int security_capset_check(kernel_cap_t *effective,
-			  kernel_cap_t *inheritable,
-			  kernel_cap_t *permitted)
+int security_capset_check(const kernel_cap_t *effective,
+			  const kernel_cap_t *inheritable,
+			  const kernel_cap_t *permitted)
 {
 	return security_ops->capset_check(effective, inheritable, permitted);
 }
 
-void security_capset_set(kernel_cap_t *effective,
-			 kernel_cap_t *inheritable,
-			 kernel_cap_t *permitted)
+void security_capset_set(const kernel_cap_t *effective,
+			 const kernel_cap_t *inheritable,
+			 const kernel_cap_t *permitted)
 {
 	security_ops->capset_set(effective, inheritable, permitted);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index df9986940e9..9f6da154cc8 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1790,8 +1790,9 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 	return secondary_ops->capget(target, effective, inheritable, permitted);
 }
 
-static int selinux_capset_check(kernel_cap_t *effective,
-				kernel_cap_t *inheritable, kernel_cap_t *permitted)
+static int selinux_capset_check(const kernel_cap_t *effective,
+				const kernel_cap_t *inheritable,
+				const kernel_cap_t *permitted)
 {
 	int error;
 
@@ -1802,8 +1803,9 @@ static int selinux_capset_check(kernel_cap_t *effective,
 	return task_has_perm(current, current, PROCESS__SETCAP);
 }
 
-static void selinux_capset_set(kernel_cap_t *effective,
-			       kernel_cap_t *inheritable, kernel_cap_t *permitted)
+static void selinux_capset_set(const kernel_cap_t *effective,
+			       const kernel_cap_t *inheritable,
+			       const kernel_cap_t *permitted)
 {
 	secondary_ops->capset_set(effective, inheritable, permitted);
 }
-- 
cgit v1.2.3-70-g09d2


From b6dff3ec5e116e3af6f537d4caedcad6b9e5082a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:16 +1100
Subject: CRED: Separate task security context from task_struct

Separate the task security context from task_struct.  At this point, the
security data is temporarily embedded in the task_struct with two pointers
pointing to it.

Note that the Alpha arch is altered as it refers to (E)UID and (E)GID in
entry.S via asm-offsets.

With comment fixes Signed-off-by: Marc Dionne <marc.c.dionne@gmail.com>

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 arch/alpha/kernel/asm-offsets.c  |  11 +-
 arch/alpha/kernel/entry.S        |  10 +-
 arch/ia64/ia32/sys_ia32.c        |   8 +-
 arch/mips/kernel/kspd.c          |   4 +-
 arch/s390/kernel/compat_linux.c  |  28 ++---
 drivers/connector/cn_proc.c      |   8 +-
 fs/binfmt_elf.c                  |  12 +-
 fs/binfmt_elf_fdpic.c            |  12 +-
 fs/exec.c                        |   4 +-
 fs/fcntl.c                       |   4 +-
 fs/file_table.c                  |   4 +-
 fs/fuse/dir.c                    |  12 +-
 fs/hugetlbfs/inode.c             |   4 +-
 fs/ioprio.c                      |  12 +-
 fs/nfsd/auth.c                   |  22 ++--
 fs/nfsd/nfs4recover.c            |  12 +-
 fs/nfsd/nfsfh.c                  |   6 +-
 fs/open.c                        |  17 +--
 fs/proc/array.c                  |  18 +--
 fs/proc/base.c                   |  16 +--
 fs/xfs/linux-2.6/xfs_cred.h      |   6 +-
 fs/xfs/linux-2.6/xfs_globals.h   |   2 +-
 fs/xfs/xfs_inode.h               |   2 +-
 fs/xfs/xfs_vnodeops.h            |  10 +-
 include/linux/cred.h             | 155 +++++++++++++++++++----
 include/linux/init_task.h        |  24 ++--
 include/linux/sched.h            |  52 +-------
 include/linux/securebits.h       |   2 +-
 ipc/mqueue.c                     |   2 +-
 ipc/shm.c                        |   4 +-
 kernel/auditsc.c                 |  52 ++++----
 kernel/capability.c              |   4 +-
 kernel/cgroup.c                  |   4 +-
 kernel/exit.c                    |  10 +-
 kernel/fork.c                    |  24 ++--
 kernel/futex.c                   |   6 +-
 kernel/futex_compat.c            |   5 +-
 kernel/ptrace.c                  |  19 +--
 kernel/sched.c                   |  10 +-
 kernel/signal.c                  |  16 +--
 kernel/sys.c                     | 266 ++++++++++++++++++++++-----------------
 kernel/trace/trace.c             |   2 +-
 kernel/tsacct.c                  |   4 +-
 kernel/uid16.c                   |  28 ++---
 kernel/user.c                    |   4 +-
 mm/mempolicy.c                   |  10 +-
 mm/migrate.c                     |  10 +-
 mm/oom_kill.c                    |   2 +-
 net/core/scm.c                   |  10 +-
 net/sunrpc/auth.c                |   2 +-
 security/commoncap.c             | 161 +++++++++++++-----------
 security/keys/keyctl.c           |  25 ++--
 security/keys/permission.c       |  11 +-
 security/keys/process_keys.c     |  98 ++++++++-------
 security/keys/request_key.c      |  18 +--
 security/keys/request_key_auth.c |  12 +-
 security/selinux/exports.c       |   2 +-
 security/selinux/hooks.c         | 116 ++++++++---------
 security/selinux/selinuxfs.c     |   2 +-
 security/selinux/xfrm.c          |   6 +-
 security/smack/smack_access.c    |   4 +-
 security/smack/smack_lsm.c       |  77 ++++++------
 security/smack/smackfs.c         |   6 +-
 63 files changed, 832 insertions(+), 677 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
index 4b18cd94d59..6ff8886e7e2 100644
--- a/arch/alpha/kernel/asm-offsets.c
+++ b/arch/alpha/kernel/asm-offsets.c
@@ -19,15 +19,18 @@ void foo(void)
 	BLANK();
 
         DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
-        DEFINE(TASK_UID, offsetof(struct task_struct, uid));
-        DEFINE(TASK_EUID, offsetof(struct task_struct, euid));
-        DEFINE(TASK_GID, offsetof(struct task_struct, gid));
-        DEFINE(TASK_EGID, offsetof(struct task_struct, egid));
+        DEFINE(TASK_CRED, offsetof(struct task_struct, cred));
         DEFINE(TASK_REAL_PARENT, offsetof(struct task_struct, real_parent));
         DEFINE(TASK_GROUP_LEADER, offsetof(struct task_struct, group_leader));
         DEFINE(TASK_TGID, offsetof(struct task_struct, tgid));
         BLANK();
 
+        DEFINE(CRED_UID,  offsetof(struct cred, uid));
+        DEFINE(CRED_EUID, offsetof(struct cred, euid));
+        DEFINE(CRED_GID,  offsetof(struct cred, gid));
+        DEFINE(CRED_EGID, offsetof(struct cred, egid));
+        BLANK();
+
 	DEFINE(SIZEOF_PT_REGS, sizeof(struct pt_regs));
 	DEFINE(PT_PTRACED, PT_PTRACED);
 	DEFINE(CLONE_VM, CLONE_VM);
diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S
index 5fc61e281ac..f77345bc66a 100644
--- a/arch/alpha/kernel/entry.S
+++ b/arch/alpha/kernel/entry.S
@@ -850,8 +850,9 @@ osf_getpriority:
 sys_getxuid:
 	.prologue 0
 	ldq	$2, TI_TASK($8)
-	ldl	$0, TASK_UID($2)
-	ldl	$1, TASK_EUID($2)
+	ldq	$3, TASK_CRED($2)
+	ldl	$0, CRED_UID($3)
+	ldl	$1, CRED_EUID($3)
 	stq	$1, 80($sp)
 	ret
 .end sys_getxuid
@@ -862,8 +863,9 @@ sys_getxuid:
 sys_getxgid:
 	.prologue 0
 	ldq	$2, TI_TASK($8)
-	ldl	$0, TASK_GID($2)
-	ldl	$1, TASK_EGID($2)
+	ldq	$3, TASK_CRED($2)
+	ldl	$0, CRED_GID($3)
+	ldl	$1, CRED_EGID($3)
 	stq	$1, 80($sp)
 	ret
 .end sys_getxgid
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 5e92ae00bdb..2445a9d3488 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1772,20 +1772,20 @@ sys32_getgroups16 (int gidsetsize, short __user *grouplist)
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->group_info);
-	i = current->group_info->ngroups;
+	get_group_info(current->cred->group_info);
+	i = current->cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->group_info)) {
+		if (groups16_to_user(grouplist, current->cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->group_info);
+	put_group_info(current->cred->group_info);
 	return i;
 }
 
diff --git a/arch/mips/kernel/kspd.c b/arch/mips/kernel/kspd.c
index b0591ae0ce5..fd6e5122403 100644
--- a/arch/mips/kernel/kspd.c
+++ b/arch/mips/kernel/kspd.c
@@ -174,8 +174,8 @@ static unsigned int translate_open_flags(int flags)
 
 static void sp_setfsuidgid( uid_t uid, gid_t gid)
 {
-	current->fsuid = uid;
-	current->fsgid = gid;
+	current->cred->fsuid = uid;
+	current->cred->fsgid = gid;
 
 	key_fsuid_changed(current);
 	key_fsgid_changed(current);
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 4646382af34..6cc87d8c868 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -148,9 +148,9 @@ asmlinkage long sys32_getresuid16(u16 __user *ruid, u16 __user *euid, u16 __user
 {
 	int retval;
 
-	if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
-	    !(retval = put_user(high2lowuid(current->euid), euid)))
-		retval = put_user(high2lowuid(current->suid), suid);
+	if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) &&
+	    !(retval = put_user(high2lowuid(current->cred->euid), euid)))
+		retval = put_user(high2lowuid(current->cred->suid), suid);
 
 	return retval;
 }
@@ -165,9 +165,9 @@ asmlinkage long sys32_getresgid16(u16 __user *rgid, u16 __user *egid, u16 __user
 {
 	int retval;
 
-	if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
-	    !(retval = put_user(high2lowgid(current->egid), egid)))
-		retval = put_user(high2lowgid(current->sgid), sgid);
+	if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) &&
+	    !(retval = put_user(high2lowgid(current->cred->egid), egid)))
+		retval = put_user(high2lowgid(current->cred->sgid), sgid);
 
 	return retval;
 }
@@ -217,20 +217,20 @@ asmlinkage long sys32_getgroups16(int gidsetsize, u16 __user *grouplist)
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->group_info);
-	i = current->group_info->ngroups;
+	get_group_info(current->cred->group_info);
+	i = current->cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->group_info)) {
+		if (groups16_to_user(grouplist, current->cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->group_info);
+	put_group_info(current->cred->group_info);
 	return i;
 }
 
@@ -261,22 +261,22 @@ asmlinkage long sys32_setgroups16(int gidsetsize, u16 __user *grouplist)
 
 asmlinkage long sys32_getuid16(void)
 {
-	return high2lowuid(current->uid);
+	return high2lowuid(current->cred->uid);
 }
 
 asmlinkage long sys32_geteuid16(void)
 {
-	return high2lowuid(current->euid);
+	return high2lowuid(current->cred->euid);
 }
 
 asmlinkage long sys32_getgid16(void)
 {
-	return high2lowgid(current->gid);
+	return high2lowgid(current->cred->gid);
 }
 
 asmlinkage long sys32_getegid16(void)
 {
-	return high2lowgid(current->egid);
+	return high2lowgid(current->cred->egid);
 }
 
 /*
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 5c9f67f98d1..354c1ff1715 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -116,11 +116,11 @@ void proc_id_connector(struct task_struct *task, int which_id)
 	ev->event_data.id.process_pid = task->pid;
 	ev->event_data.id.process_tgid = task->tgid;
 	if (which_id == PROC_EVENT_UID) {
-	 	ev->event_data.id.r.ruid = task->uid;
-	 	ev->event_data.id.e.euid = task->euid;
+		ev->event_data.id.r.ruid = task->cred->uid;
+		ev->event_data.id.e.euid = task->cred->euid;
 	} else if (which_id == PROC_EVENT_GID) {
-	   	ev->event_data.id.r.rgid = task->gid;
-	   	ev->event_data.id.e.egid = task->egid;
+		ev->event_data.id.r.rgid = task->cred->gid;
+		ev->event_data.id.e.egid = task->cred->egid;
 	} else
 	     	return;
 	get_seq(&msg->seq, &ev->cpu);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8fcfa398d35..7a52477ce49 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
 	NEW_AUX_ENT(AT_FLAGS, 0);
 	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-	NEW_AUX_ENT(AT_UID, tsk->uid);
-	NEW_AUX_ENT(AT_EUID, tsk->euid);
-	NEW_AUX_ENT(AT_GID, tsk->gid);
-	NEW_AUX_ENT(AT_EGID, tsk->egid);
+	NEW_AUX_ENT(AT_UID, tsk->cred->uid);
+	NEW_AUX_ENT(AT_EUID, tsk->cred->euid);
+	NEW_AUX_ENT(AT_GID, tsk->cred->gid);
+	NEW_AUX_ENT(AT_EGID, tsk->cred->egid);
  	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
@@ -1388,8 +1388,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
 	psinfo->pr_nice = task_nice(p);
 	psinfo->pr_flag = p->flags;
-	SET_UID(psinfo->pr_uid, p->uid);
-	SET_GID(psinfo->pr_gid, p->gid);
+	SET_UID(psinfo->pr_uid, p->cred->uid);
+	SET_GID(psinfo->pr_gid, p->cred->gid);
 	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
 	
 	return 0;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 488584c8751..9f67054c2c4 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -623,10 +623,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	NEW_AUX_ENT(AT_BASE,	interp_params->elfhdr_addr);
 	NEW_AUX_ENT(AT_FLAGS,	0);
 	NEW_AUX_ENT(AT_ENTRY,	exec_params->entry_addr);
-	NEW_AUX_ENT(AT_UID,	(elf_addr_t) current_uid());
-	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) current_euid());
-	NEW_AUX_ENT(AT_GID,	(elf_addr_t) current_gid());
-	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) current_egid());
+	NEW_AUX_ENT(AT_UID,	(elf_addr_t) current->cred->uid);
+	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) current->cred->euid);
+	NEW_AUX_ENT(AT_GID,	(elf_addr_t) current->cred->gid);
+	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) current->cred->egid);
 	NEW_AUX_ENT(AT_SECURE,	security_bprm_secureexec(bprm));
 	NEW_AUX_ENT(AT_EXECFN,	bprm->exec);
 
@@ -1440,8 +1440,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
 	psinfo->pr_nice = task_nice(p);
 	psinfo->pr_flag = p->flags;
-	SET_UID(psinfo->pr_uid, p->uid);
-	SET_GID(psinfo->pr_gid, p->gid);
+	SET_UID(psinfo->pr_uid, p->cred->uid);
+	SET_GID(psinfo->pr_gid, p->cred->gid);
 	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
 
 	return 0;
diff --git a/fs/exec.c b/fs/exec.c
index 604834f3b20..31149e430a8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1738,7 +1738,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 */
 	if (get_dumpable(mm) == 2) {	/* Setuid core dump mode */
 		flag = O_EXCL;		/* Stop rewrite attacks */
-		current->fsuid = 0;	/* Dump root private */
+		current->cred->fsuid = 0;	/* Dump root private */
 	}
 
 	retval = coredump_wait(exit_code, &core_state);
@@ -1834,7 +1834,7 @@ fail_unlock:
 	if (helper_argv)
 		argv_free(helper_argv);
 
-	current->fsuid = fsuid;
+	current->cred->fsuid = fsuid;
 	coredump_finish(mm);
 fail:
 	return retval;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bf049a805e5..63964d863ad 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -401,8 +401,8 @@ static inline int sigio_perm(struct task_struct *p,
                              struct fown_struct *fown, int sig)
 {
 	return (((fown->euid == 0) ||
-		 (fown->euid == p->suid) || (fown->euid == p->uid) ||
-		 (fown->uid == p->suid) || (fown->uid == p->uid)) &&
+		 (fown->euid == p->cred->suid) || (fown->euid == p->cred->uid) ||
+		 (fown->uid == p->cred->suid) || (fown->uid == p->cred->uid)) &&
 		!security_file_send_sigiotask(p, fown, sig));
 }
 
diff --git a/fs/file_table.c b/fs/file_table.c
index 5ad0eca6eea..3152b53cfab 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -122,8 +122,8 @@ struct file *get_empty_filp(void)
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
-	f->f_uid = tsk->fsuid;
-	f->f_gid = tsk->fsgid;
+	f->f_uid = tsk->cred->fsuid;
+	f->f_gid = tsk->cred->fsgid;
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fd03330cade..e97a9898186 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -872,12 +872,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		return 1;
 
-	if (task->euid == fc->user_id &&
-	    task->suid == fc->user_id &&
-	    task->uid == fc->user_id &&
-	    task->egid == fc->group_id &&
-	    task->sgid == fc->group_id &&
-	    task->gid == fc->group_id)
+	if (task->cred->euid == fc->user_id &&
+	    task->cred->suid == fc->user_id &&
+	    task->cred->uid == fc->user_id &&
+	    task->cred->egid == fc->group_id &&
+	    task->cred->sgid == fc->group_id &&
+	    task->cred->gid == fc->group_id)
 		return 1;
 
 	return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 08ad76c79b4..870a721b8bd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -958,7 +958,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!user_shm_lock(size, current->user))
+	if (!user_shm_lock(size, current->cred->user))
 		return ERR_PTR(-ENOMEM);
 
 	root = hugetlbfs_vfsmount->mnt_root;
@@ -998,7 +998,7 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	user_shm_unlock(size, current->user);
+	user_shm_unlock(size, current->cred->user);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 68d2cd80711..bb5210af77c 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -32,8 +32,8 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 	int err;
 	struct io_context *ioc;
 
-	if (task->uid != current_euid() &&
-	    task->uid != current_uid() && !capable(CAP_SYS_NICE))
+	if (task->cred->uid != current_euid() &&
+	    task->cred->uid != current_uid() && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	err = security_task_setioprio(task, ioprio);
@@ -123,7 +123,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 			break;
 		case IOPRIO_WHO_USER:
 			if (!who)
-				user = current->user;
+				user = current->cred->user;
 			else
 				user = find_user(who);
 
@@ -131,7 +131,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 				break;
 
 			do_each_thread(g, p) {
-				if (p->uid != who)
+				if (p->cred->uid != who)
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -216,7 +216,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 			break;
 		case IOPRIO_WHO_USER:
 			if (!who)
-				user = current->user;
+				user = current->cred->user;
 			else
 				user = find_user(who);
 
@@ -224,7 +224,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 				break;
 
 			do_each_thread(g, p) {
-				if (p->uid != user->uid)
+				if (p->cred->uid != user->uid)
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 294992e9bf6..808fc03a6fb 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -27,6 +27,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
 
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
+	struct cred *act_as = current->cred ;
 	struct svc_cred	cred = rqstp->rq_cred;
 	int i;
 	int flags = nfsexp_flags(rqstp, exp);
@@ -55,25 +56,26 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		get_group_info(cred.cr_group_info);
 
 	if (cred.cr_uid != (uid_t) -1)
-		current->fsuid = cred.cr_uid;
+		act_as->fsuid = cred.cr_uid;
 	else
-		current->fsuid = exp->ex_anon_uid;
+		act_as->fsuid = exp->ex_anon_uid;
 	if (cred.cr_gid != (gid_t) -1)
-		current->fsgid = cred.cr_gid;
+		act_as->fsgid = cred.cr_gid;
 	else
-		current->fsgid = exp->ex_anon_gid;
+		act_as->fsgid = exp->ex_anon_gid;
 
 	if (!cred.cr_group_info)
 		return -ENOMEM;
-	ret = set_current_groups(cred.cr_group_info);
+	ret = set_groups(act_as, cred.cr_group_info);
 	put_group_info(cred.cr_group_info);
 	if ((cred.cr_uid)) {
-		current->cap_effective =
-			cap_drop_nfsd_set(current->cap_effective);
+		act_as->cap_effective =
+			cap_drop_nfsd_set(act_as->cap_effective);
 	} else {
-		current->cap_effective =
-			cap_raise_nfsd_set(current->cap_effective,
-					   current->cap_permitted);
+		act_as->cap_effective =
+			cap_raise_nfsd_set(act_as->cap_effective,
+					   act_as->cap_permitted);
 	}
 	return ret;
 }
+
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index bb93946ace2..a5e14e8695e 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -57,17 +57,17 @@ static int rec_dir_init = 0;
 static void
 nfs4_save_user(uid_t *saveuid, gid_t *savegid)
 {
-	*saveuid = current->fsuid;
-	*savegid = current->fsgid;
-	current->fsuid = 0;
-	current->fsgid = 0;
+	*saveuid = current->cred->fsuid;
+	*savegid = current->cred->fsgid;
+	current->cred->fsuid = 0;
+	current->cred->fsgid = 0;
 }
 
 static void
 nfs4_reset_user(uid_t saveuid, gid_t savegid)
 {
-	current->fsuid = saveuid;
-	current->fsgid = savegid;
+	current->cred->fsuid = saveuid;
+	current->cred->fsgid = savegid;
 }
 
 static void
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index cd25d91895a..e67cfaea086 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -186,9 +186,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		 * access control settings being in effect, we cannot
 		 * fix that case easily.
 		 */
-		current->cap_effective =
-			cap_raise_nfsd_set(current->cap_effective,
-					   current->cap_permitted);
+		current->cred->cap_effective =
+			cap_raise_nfsd_set(current->cred->cap_effective,
+					   current->cred->cap_permitted);
 	} else {
 		error = nfsd_setuser_and_check_port(rqstp, exp);
 		if (error)
diff --git a/fs/open.c b/fs/open.c
index 500cc0c5476..b1238e195e7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -425,6 +425,7 @@ out:
  */
 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
+	struct cred *cred = current->cred;
 	struct path path;
 	struct inode *inode;
 	int old_fsuid, old_fsgid;
@@ -434,18 +435,18 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
 		return -EINVAL;
 
-	old_fsuid = current->fsuid;
-	old_fsgid = current->fsgid;
+	old_fsuid = cred->fsuid;
+	old_fsgid = cred->fsgid;
 
-	current->fsuid = current->uid;
-	current->fsgid = current->gid;
+	cred->fsuid = cred->uid;
+	cred->fsgid = cred->gid;
 
 	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 		/* Clear the capabilities if we switch to a non-root user */
-		if (current->uid)
+		if (current->cred->uid)
 			old_cap = cap_set_effective(__cap_empty_set);
 		else
-			old_cap = cap_set_effective(current->cap_permitted);
+			old_cap = cap_set_effective(cred->cap_permitted);
 	}
 
 	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
@@ -484,8 +485,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 out_path_release:
 	path_put(&path);
 out:
-	current->fsuid = old_fsuid;
-	current->fsgid = old_fsgid;
+	cred->fsuid = old_fsuid;
+	cred->fsgid = old_fsgid;
 
 	if (!issecure(SECURE_NO_SETUID_FIXUP))
 		cap_set_effective(old_cap);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6af7fba7abb..62fe9b2009b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -182,8 +182,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		task_tgid_nr_ns(p, ns),
 		pid_nr_ns(pid, ns),
 		ppid, tpid,
-		p->uid, p->euid, p->suid, p->fsuid,
-		p->gid, p->egid, p->sgid, p->fsgid);
+		p->cred->uid, p->cred->euid, p->cred->suid, p->cred->fsuid,
+		p->cred->gid, p->cred->egid, p->cred->sgid, p->cred->fsgid);
 
 	task_lock(p);
 	if (p->files)
@@ -194,7 +194,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		fdt ? fdt->max_fds : 0);
 	rcu_read_unlock();
 
-	group_info = p->group_info;
+	group_info = p->cred->group_info;
 	get_group_info(group_info);
 	task_unlock(p);
 
@@ -262,7 +262,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = atomic_read(&p->signal->count);
-		qsize = atomic_read(&p->user->sigpending);
+		qsize = atomic_read(&p->cred->user->sigpending);
 		qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
 		unlock_task_sighand(p, &flags);
 	}
@@ -293,10 +293,12 @@ static void render_cap_t(struct seq_file *m, const char *header,
 
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
 {
-	render_cap_t(m, "CapInh:\t", &p->cap_inheritable);
-	render_cap_t(m, "CapPrm:\t", &p->cap_permitted);
-	render_cap_t(m, "CapEff:\t", &p->cap_effective);
-	render_cap_t(m, "CapBnd:\t", &p->cap_bset);
+	struct cred *cred = p->cred;
+
+	render_cap_t(m, "CapInh:\t", &cred->cap_inheritable);
+	render_cap_t(m, "CapPrm:\t", &cred->cap_permitted);
+	render_cap_t(m, "CapEff:\t", &cred->cap_effective);
+	render_cap_t(m, "CapBnd:\t", &cred->cap_bset);
 }
 
 static inline void task_context_switch_counts(struct seq_file *m,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3fe713..6862b360c36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1428,8 +1428,8 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
-		inode->i_uid = task->euid;
-		inode->i_gid = task->egid;
+		inode->i_uid = task->cred->euid;
+		inode->i_gid = task->cred->egid;
 	}
 	security_task_to_inode(task, inode);
 
@@ -1454,8 +1454,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
 	if (task) {
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
-			stat->uid = task->euid;
-			stat->gid = task->egid;
+			stat->uid = task->cred->euid;
+			stat->gid = task->cred->egid;
 		}
 	}
 	rcu_read_unlock();
@@ -1486,8 +1486,8 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if (task) {
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
-			inode->i_uid = task->euid;
-			inode->i_gid = task->egid;
+			inode->i_uid = task->cred->euid;
+			inode->i_gid = task->cred->egid;
 		} else {
 			inode->i_uid = 0;
 			inode->i_gid = 0;
@@ -1658,8 +1658,8 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 				rcu_read_unlock();
 				put_files_struct(files);
 				if (task_dumpable(task)) {
-					inode->i_uid = task->euid;
-					inode->i_gid = task->egid;
+					inode->i_uid = task->cred->euid;
+					inode->i_gid = task->cred->egid;
 				} else {
 					inode->i_uid = 0;
 					inode->i_gid = 0;
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 293043a5573..8c022cd0ad6 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -23,11 +23,9 @@
 /*
  * Credentials
  */
-typedef struct cred {
-       /* EMPTY */
-} cred_t;
+typedef const struct cred cred_t;
 
-extern struct cred *sys_cred;
+extern cred_t *sys_cred;
 
 /* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
 static inline int capable_cred(cred_t *cr, int cid)
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
index 2770b0085ee..6eda8a3eb6f 100644
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -19,6 +19,6 @@
 #define __XFS_GLOBALS_H__
 
 extern uint64_t	xfs_panic_mask;		/* set to cause more panics */
-extern struct cred *sys_cred;
+extern cred_t *sys_cred;
 
 #endif	/* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1420c49674d..6be310d41da 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -497,7 +497,7 @@ int		xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			  xfs_inode_t **, xfs_daddr_t, uint);
 int		xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
-			   xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t,
+			   xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
 			   int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
 void		xfs_dinode_from_disk(struct xfs_icdinode *,
 				     struct xfs_dinode_core *);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index e932a96bec5..7b0c2ab8833 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -16,7 +16,7 @@ struct xfs_iomap;
 
 int xfs_open(struct xfs_inode *ip);
 int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
-		struct cred *credp);
+		cred_t *credp);
 #define	XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
 #define	XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
@@ -28,24 +28,24 @@ int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
-		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
+		xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
 int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
-		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
+		mode_t mode, struct xfs_inode **ipp, cred_t *credp);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, mode_t mode, struct xfs_inode **ipp,
-		struct cred *credp);
+		cred_t *credp);
 int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 		xfs_flock64_t *bf, xfs_off_t offset,
-		struct cred *credp, int	attr_flags);
+		cred_t *credp, int	attr_flags);
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
 		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
 		struct xfs_name *target_name, struct xfs_inode *target_ip);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index b69222cc1fd..3e65587a72e 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -12,39 +12,150 @@
 #ifndef _LINUX_CRED_H
 #define _LINUX_CRED_H
 
-#define get_current_user()	(get_uid(current->user))
-
-#define task_uid(task)		((task)->uid)
-#define task_gid(task)		((task)->gid)
-#define task_euid(task)		((task)->euid)
-#define task_egid(task)		((task)->egid)
-
-#define current_uid()		(current->uid)
-#define current_gid()		(current->gid)
-#define current_euid()		(current->euid)
-#define current_egid()		(current->egid)
-#define current_suid()		(current->suid)
-#define current_sgid()		(current->sgid)
-#define current_fsuid()		(current->fsuid)
-#define current_fsgid()		(current->fsgid)
-#define current_cap()		(current->cap_effective)
+#include <linux/capability.h>
+#include <linux/key.h>
+#include <asm/atomic.h>
+
+struct user_struct;
+struct cred;
+
+/*
+ * COW Supplementary groups list
+ */
+#define NGROUPS_SMALL		32
+#define NGROUPS_PER_BLOCK	((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
+
+struct group_info {
+	atomic_t	usage;
+	int		ngroups;
+	int		nblocks;
+	gid_t		small_block[NGROUPS_SMALL];
+	gid_t		*blocks[0];
+};
+
+/**
+ * get_group_info - Get a reference to a group info structure
+ * @group_info: The group info to reference
+ *
+ * This must be called with the owning task locked (via task_lock()) when task
+ * != current.  The reason being that the vast majority of callers are looking
+ * at current->group_info, which can not be changed except by the current task.
+ * Changing current->group_info requires the task lock, too.
+ */
+#define get_group_info(group_info)		\
+do {						\
+	atomic_inc(&(group_info)->usage);	\
+} while (0)
+
+/**
+ * put_group_info - Release a reference to a group info structure
+ * @group_info: The group info to release
+ */
+#define put_group_info(group_info)			\
+do {							\
+	if (atomic_dec_and_test(&(group_info)->usage))	\
+		groups_free(group_info);		\
+} while (0)
+
+extern struct group_info *groups_alloc(int);
+extern void groups_free(struct group_info *);
+extern int set_current_groups(struct group_info *);
+extern int set_groups(struct cred *, struct group_info *);
+extern int groups_search(struct group_info *, gid_t);
+
+/* access the groups "array" with this macro */
+#define GROUP_AT(gi, i) \
+	((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
+
+extern int in_group_p(gid_t);
+extern int in_egroup_p(gid_t);
+
+/*
+ * The security context of a task
+ *
+ * The parts of the context break down into two categories:
+ *
+ *  (1) The objective context of a task.  These parts are used when some other
+ *	task is attempting to affect this one.
+ *
+ *  (2) The subjective context.  These details are used when the task is acting
+ *	upon another object, be that a file, a task, a key or whatever.
+ *
+ * Note that some members of this structure belong to both categories - the
+ * LSM security pointer for instance.
+ *
+ * A task has two security pointers.  task->real_cred points to the objective
+ * context that defines that task's actual details.  The objective part of this
+ * context is used whenever that task is acted upon.
+ *
+ * task->cred points to the subjective context that defines the details of how
+ * that task is going to act upon another object.  This may be overridden
+ * temporarily to point to another security context, but normally points to the
+ * same context as task->real_cred.
+ */
+struct cred {
+	atomic_t	usage;
+	uid_t		uid;		/* real UID of the task */
+	gid_t		gid;		/* real GID of the task */
+	uid_t		suid;		/* saved UID of the task */
+	gid_t		sgid;		/* saved GID of the task */
+	uid_t		euid;		/* effective UID of the task */
+	gid_t		egid;		/* effective GID of the task */
+	uid_t		fsuid;		/* UID for VFS ops */
+	gid_t		fsgid;		/* GID for VFS ops */
+	unsigned	securebits;	/* SUID-less security management */
+	kernel_cap_t	cap_inheritable; /* caps our children can inherit */
+	kernel_cap_t	cap_permitted;	/* caps we're permitted */
+	kernel_cap_t	cap_effective;	/* caps we can actually use */
+	kernel_cap_t	cap_bset;	/* capability bounding set */
+#ifdef CONFIG_KEYS
+	unsigned char	jit_keyring;	/* default keyring to attach requested
+					 * keys to */
+	struct key	*thread_keyring; /* keyring private to this thread */
+	struct key	*request_key_auth; /* assumed request_key authority */
+#endif
+#ifdef CONFIG_SECURITY
+	void		*security;	/* subjective LSM security */
+#endif
+	struct user_struct *user;	/* real user ID subscription */
+	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
+	struct rcu_head	rcu;		/* RCU deletion hook */
+	spinlock_t	lock;		/* lock for pointer changes */
+};
+
+#define get_current_user()	(get_uid(current->cred->user))
+
+#define task_uid(task)		((task)->cred->uid)
+#define task_gid(task)		((task)->cred->gid)
+#define task_euid(task)		((task)->cred->euid)
+#define task_egid(task)		((task)->cred->egid)
+
+#define current_uid()		(current->cred->uid)
+#define current_gid()		(current->cred->gid)
+#define current_euid()		(current->cred->euid)
+#define current_egid()		(current->cred->egid)
+#define current_suid()		(current->cred->suid)
+#define current_sgid()		(current->cred->sgid)
+#define current_fsuid()		(current->cred->fsuid)
+#define current_fsgid()		(current->cred->fsgid)
+#define current_cap()		(current->cred->cap_effective)
 
 #define current_uid_gid(_uid, _gid)		\
 do {						\
-	*(_uid) = current->uid;			\
-	*(_gid) = current->gid;			\
+	*(_uid) = current->cred->uid;		\
+	*(_gid) = current->cred->gid;		\
 } while(0)
 
 #define current_euid_egid(_uid, _gid)		\
 do {						\
-	*(_uid) = current->euid;		\
-	*(_gid) = current->egid;		\
+	*(_uid) = current->cred->euid;		\
+	*(_gid) = current->cred->egid;		\
 } while(0)
 
 #define current_fsuid_fsgid(_uid, _gid)		\
 do {						\
-	*(_uid) = current->fsuid;		\
-	*(_gid) = current->fsgid;		\
+	*(_uid) = current->cred->fsuid;		\
+	*(_gid) = current->cred->fsgid;		\
 } while(0)
 
 #endif /* _LINUX_CRED_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd8909b9e..9de41ccd67b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -113,6 +113,21 @@ extern struct group_info init_groups;
 # define CAP_INIT_BSET  CAP_INIT_EFF_SET
 #endif
 
+extern struct cred init_cred;
+
+#define INIT_CRED(p)						\
+{								\
+	.usage			= ATOMIC_INIT(3),		\
+	.securebits		= SECUREBITS_DEFAULT,		\
+	.cap_inheritable	= CAP_INIT_INH_SET,		\
+	.cap_permitted		= CAP_FULL_SET,			\
+	.cap_effective		= CAP_INIT_EFF_SET,		\
+	.cap_bset		= CAP_INIT_BSET,		\
+	.user			= INIT_USER,			\
+	.group_info		= &init_groups,			\
+	.lock			= __SPIN_LOCK_UNLOCKED(p.lock),	\
+}
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -147,13 +162,8 @@ extern struct group_info init_groups;
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
-	.group_info	= &init_groups,					\
-	.cap_effective	= CAP_INIT_EFF_SET,				\
-	.cap_inheritable = CAP_INIT_INH_SET,				\
-	.cap_permitted	= CAP_FULL_SET,					\
-	.cap_bset 	= CAP_INIT_BSET,				\
-	.securebits     = SECUREBITS_DEFAULT,				\
-	.user		= INIT_USER,					\
+	.__temp_cred	= INIT_CRED(tsk.__temp_cred),			\
+	.cred		= &tsk.__temp_cred,				\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b483f39a711..c8b92502354 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -660,6 +660,7 @@ extern struct user_struct *find_user(uid_t);
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
+
 struct backing_dev_info;
 struct reclaim_state;
 
@@ -883,38 +884,7 @@ partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 #endif	/* !CONFIG_SMP */
 
 struct io_context;			/* See blkdev.h */
-#define NGROUPS_SMALL		32
-#define NGROUPS_PER_BLOCK	((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
-struct group_info {
-	int ngroups;
-	atomic_t usage;
-	gid_t small_block[NGROUPS_SMALL];
-	int nblocks;
-	gid_t *blocks[0];
-};
-
-/*
- * get_group_info() must be called with the owning task locked (via task_lock())
- * when task != current.  The reason being that the vast majority of callers are
- * looking at current->group_info, which can not be changed except by the
- * current task.  Changing current->group_info requires the task lock, too.
- */
-#define get_group_info(group_info) do { \
-	atomic_inc(&(group_info)->usage); \
-} while (0)
 
-#define put_group_info(group_info) do { \
-	if (atomic_dec_and_test(&(group_info)->usage)) \
-		groups_free(group_info); \
-} while (0)
-
-extern struct group_info *groups_alloc(int gidsetsize);
-extern void groups_free(struct group_info *group_info);
-extern int set_current_groups(struct group_info *group_info);
-extern int groups_search(struct group_info *group_info, gid_t grp);
-/* access the groups "array" with this macro */
-#define GROUP_AT(gi, i) \
-    ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
 extern void prefetch_stack(struct task_struct *t);
@@ -1181,17 +1151,9 @@ struct task_struct {
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	uid_t uid,euid,suid,fsuid;
-	gid_t gid,egid,sgid,fsgid;
-	struct group_info *group_info;
-	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
-	struct user_struct *user;
-	unsigned securebits;
-#ifdef CONFIG_KEYS
-	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
-	struct key *request_key_auth;	/* assumed request_key authority */
-	struct key *thread_keyring;	/* keyring private to this thread */
-#endif
+	struct cred __temp_cred __deprecated; /* temporary credentials to be removed */
+	struct cred *cred;	/* actual/objective task credentials */
+
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
 				     - access with [gs]et_task_comm (which lock
 				       it with task_lock())
@@ -1228,9 +1190,6 @@ struct task_struct {
 	int (*notifier)(void *priv);
 	void *notifier_data;
 	sigset_t *notifier_mask;
-#ifdef CONFIG_SECURITY
-	void *security;
-#endif
 	struct audit_context *audit_context;
 #ifdef CONFIG_AUDITSYSCALL
 	uid_t loginuid;
@@ -1787,9 +1746,6 @@ extern void wake_up_new_task(struct task_struct *tsk,
 extern void sched_fork(struct task_struct *p, int clone_flags);
 extern void sched_dead(struct task_struct *p);
 
-extern int in_group_p(gid_t);
-extern int in_egroup_p(gid_t);
-
 extern void proc_caches_init(void);
 extern void flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
diff --git a/include/linux/securebits.h b/include/linux/securebits.h
index 92f09bdf117..6d389491bfa 100644
--- a/include/linux/securebits.h
+++ b/include/linux/securebits.h
@@ -32,7 +32,7 @@
    setting is locked or not. A setting which is locked cannot be
    changed from user-level. */
 #define issecure_mask(X)	(1 << (X))
-#define issecure(X)		(issecure_mask(X) & current->securebits)
+#define issecure(X)		(issecure_mask(X) & current->cred->securebits)
 
 #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
 				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index abda5991d7e..e1885b494ba 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -126,7 +126,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 		if (S_ISREG(mode)) {
 			struct mqueue_inode_info *info;
 			struct task_struct *p = current;
-			struct user_struct *u = p->user;
+			struct user_struct *u = p->cred->user;
 			unsigned long mq_bytes, mq_msg_tblsz;
 
 			inode->i_fop = &mqueue_file_operations;
diff --git a/ipc/shm.c b/ipc/shm.c
index 0c3debbe32d..264a9d33c5d 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -366,7 +366,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	if (shmflg & SHM_HUGETLB) {
 		/* hugetlb_file_setup takes care of mlock user accounting */
 		file = hugetlb_file_setup(name, size);
-		shp->mlock_user = current->user;
+		shp->mlock_user = current->cred->user;
 	} else {
 		int acctflag = VM_ACCOUNT;
 		/*
@@ -767,7 +767,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
 			goto out_unlock;
 		
 		if(cmd==SHM_LOCK) {
-			struct user_struct * user = current->user;
+			struct user_struct *user = current->cred->user;
 			if (!is_file_hugepages(shp->shm_file)) {
 				err = shmem_lock(shp->shm_file, 1, user);
 				if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c7e47ae457..2febf5165fa 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -447,6 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			      struct audit_names *name,
 			      enum audit_state *state)
 {
+	struct cred *cred = tsk->cred;
 	int i, j, need_sid = 1;
 	u32 sid;
 
@@ -466,28 +467,28 @@ static int audit_filter_rules(struct task_struct *tsk,
 			}
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(tsk->uid, f->op, f->val);
+			result = audit_comparator(cred->uid, f->op, f->val);
 			break;
 		case AUDIT_EUID:
-			result = audit_comparator(tsk->euid, f->op, f->val);
+			result = audit_comparator(cred->euid, f->op, f->val);
 			break;
 		case AUDIT_SUID:
-			result = audit_comparator(tsk->suid, f->op, f->val);
+			result = audit_comparator(cred->suid, f->op, f->val);
 			break;
 		case AUDIT_FSUID:
-			result = audit_comparator(tsk->fsuid, f->op, f->val);
+			result = audit_comparator(cred->fsuid, f->op, f->val);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(tsk->gid, f->op, f->val);
+			result = audit_comparator(cred->gid, f->op, f->val);
 			break;
 		case AUDIT_EGID:
-			result = audit_comparator(tsk->egid, f->op, f->val);
+			result = audit_comparator(cred->egid, f->op, f->val);
 			break;
 		case AUDIT_SGID:
-			result = audit_comparator(tsk->sgid, f->op, f->val);
+			result = audit_comparator(cred->sgid, f->op, f->val);
 			break;
 		case AUDIT_FSGID:
-			result = audit_comparator(tsk->fsgid, f->op, f->val);
+			result = audit_comparator(cred->fsgid, f->op, f->val);
 			break;
 		case AUDIT_PERS:
 			result = audit_comparator(tsk->personality, f->op, f->val);
@@ -1228,6 +1229,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
+	struct cred *cred = tsk->cred;
 	int i, call_panic = 0;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
@@ -1237,14 +1239,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	context->pid = tsk->pid;
 	if (!context->ppid)
 		context->ppid = sys_getppid();
-	context->uid = tsk->uid;
-	context->gid = tsk->gid;
-	context->euid = tsk->euid;
-	context->suid = tsk->suid;
-	context->fsuid = tsk->fsuid;
-	context->egid = tsk->egid;
-	context->sgid = tsk->sgid;
-	context->fsgid = tsk->fsgid;
+	context->uid = cred->uid;
+	context->gid = cred->gid;
+	context->euid = cred->euid;
+	context->suid = cred->suid;
+	context->fsuid = cred->fsuid;
+	context->egid = cred->egid;
+	context->sgid = cred->sgid;
+	context->fsgid = cred->fsgid;
 	context->personality = tsk->personality;
 
 	ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -2086,7 +2088,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 			audit_log_format(ab, "login pid=%d uid=%u "
 				"old auid=%u new auid=%u"
 				" old ses=%u new ses=%u",
-				task->pid, task->uid,
+				task->pid, task->cred->uid,
 				task->loginuid, loginuid,
 				task->sessionid, sessionid);
 			audit_log_end(ab);
@@ -2469,7 +2471,7 @@ void __audit_ptrace(struct task_struct *t)
 
 	context->target_pid = t->pid;
 	context->target_auid = audit_get_loginuid(t);
-	context->target_uid = t->uid;
+	context->target_uid = t->cred->uid;
 	context->target_sessionid = audit_get_sessionid(t);
 	security_task_getsecid(t, &context->target_sid);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2495,7 +2497,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 			if (tsk->loginuid != -1)
 				audit_sig_uid = tsk->loginuid;
 			else
-				audit_sig_uid = tsk->uid;
+				audit_sig_uid = tsk->cred->uid;
 			security_task_getsecid(tsk, &audit_sig_sid);
 		}
 		if (!audit_signals || audit_dummy_context())
@@ -2507,7 +2509,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	if (!ctx->target_pid) {
 		ctx->target_pid = t->tgid;
 		ctx->target_auid = audit_get_loginuid(t);
-		ctx->target_uid = t->uid;
+		ctx->target_uid = t->cred->uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
 		security_task_getsecid(t, &ctx->target_sid);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2528,7 +2530,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 	axp->target_pid[axp->pid_count] = t->tgid;
 	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
-	axp->target_uid[axp->pid_count] = t->uid;
+	axp->target_uid[axp->pid_count] = t->cred->uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
 	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
@@ -2575,12 +2577,12 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
 
 	ax->old_pcap.permitted = *pP;
-	ax->old_pcap.inheritable = current->cap_inheritable;
+	ax->old_pcap.inheritable = current->cred->cap_inheritable;
 	ax->old_pcap.effective = *pE;
 
-	ax->new_pcap.permitted = current->cap_permitted;
-	ax->new_pcap.inheritable = current->cap_inheritable;
-	ax->new_pcap.effective = current->cap_effective;
+	ax->new_pcap.permitted = current->cred->cap_permitted;
+	ax->new_pcap.inheritable = current->cred->cap_inheritable;
+	ax->new_pcap.effective = current->cred->cap_effective;
 }
 
 /**
diff --git a/kernel/capability.c b/kernel/capability.c
index 58b00519624..a404b980b1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -171,8 +171,8 @@ kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
 
 	spin_lock(&task_capability_lock);
 
-	pE_old = current->cap_effective;
-	current->cap_effective = pE_new;
+	pE_old = current->cred->cap_effective;
+	current->cred->cap_effective = pE_new;
 
 	spin_unlock(&task_capability_lock);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 78f9b310c4f..e210526e640 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1293,7 +1293,9 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 		rcu_read_unlock();
 
 		euid = current_euid();
-		if (euid && euid != tsk->uid && euid != tsk->suid) {
+		if (euid &&
+		    euid != tsk->cred->uid &&
+		    euid != tsk->cred->suid) {
 			put_task_struct(tsk);
 			return -EACCES;
 		}
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d946..e0f6e1892fb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -160,7 +160,7 @@ void release_task(struct task_struct * p)
 	int zap_leader;
 repeat:
 	tracehook_prepare_release_task(p);
-	atomic_dec(&p->user->processes);
+	atomic_dec(&p->cred->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
@@ -1272,7 +1272,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		return 0;
 
 	if (unlikely(options & WNOWAIT)) {
-		uid_t uid = p->uid;
+		uid_t uid = p->cred->uid;
 		int exit_code = p->exit_code;
 		int why, status;
 
@@ -1393,7 +1393,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (!retval && infop)
 		retval = put_user(pid, &infop->si_pid);
 	if (!retval && infop)
-		retval = put_user(p->uid, &infop->si_uid);
+		retval = put_user(p->cred->uid, &infop->si_uid);
 	if (!retval)
 		retval = pid;
 
@@ -1458,7 +1458,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
-	uid = p->uid;
+	uid = p->cred->uid;
 unlock_sig:
 	spin_unlock_irq(&p->sighand->siglock);
 	if (!exit_code)
@@ -1535,7 +1535,7 @@ static int wait_task_continued(struct task_struct *p, int options,
 	spin_unlock_irq(&p->sighand->siglock);
 
 	pid = task_pid_vnr(p);
-	uid = p->uid;
+	uid = p->cred->uid;
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe..81fdc773390 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -147,8 +147,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	security_task_free(tsk);
-	free_uid(tsk->user);
-	put_group_info(tsk->group_info);
+	free_uid(tsk->__temp_cred.user);
+	put_group_info(tsk->__temp_cred.group_info);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -969,17 +969,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
+	p->cred = &p->__temp_cred;
 	retval = -EAGAIN;
-	if (atomic_read(&p->user->processes) >=
+	if (atomic_read(&p->cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->user != current->nsproxy->user_ns->root_user)
+		    p->cred->user != current->nsproxy->user_ns->root_user)
 			goto bad_fork_free;
 	}
 
-	atomic_inc(&p->user->__count);
-	atomic_inc(&p->user->processes);
-	get_group_info(p->group_info);
+	atomic_inc(&p->cred->user->__count);
+	atomic_inc(&p->cred->user->processes);
+	get_group_info(p->cred->group_info);
 
 	/*
 	 * If multiple threads are within copy_process(), then this check
@@ -1035,9 +1036,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
 #ifdef CONFIG_SECURITY
-	p->security = NULL;
+	p->cred->security = NULL;
 #endif
-	p->cap_bset = current->cap_bset;
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
@@ -1298,9 +1298,9 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
-	put_group_info(p->group_info);
-	atomic_dec(&p->user->processes);
-	free_uid(p->user);
+	put_group_info(p->cred->group_info);
+	atomic_dec(&p->cred->user->processes);
+	free_uid(p->cred->user);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index e06962132aa..28421d8210b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -443,7 +443,8 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
-	if (!p || (euid != p->euid && euid != p->uid))
+	if (!p || (euid != p->cred->euid &&
+		   euid != p->cred->uid))
 		p = ERR_PTR(-ESRCH);
 	else
 		get_task_struct(p);
@@ -1846,7 +1847,8 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->euid && euid != p->uid &&
+		if (euid != p->cred->euid &&
+		    euid != p->cred->uid &&
 		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 3254d4e41e8..2c3fd5ed34f 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->euid && euid != p->uid &&
-				!capable(CAP_SYS_PTRACE))
+		if (euid != p->cred->euid &&
+		    euid != p->cred->uid &&
+		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->compat_robust_list;
 		read_unlock(&tasklist_lock);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 937f6b5b200..49849d12dd1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -115,6 +115,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
+	struct cred *cred = current->cred, *tcred = task->cred;
+
 	/* May we inspect the given task?
 	 * This check is used both for attaching with ptrace
 	 * and for allowing access to sensitive information in /proc.
@@ -123,19 +125,18 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * because setting up the necessary parent/child relationship
 	 * or halting the specified task is impossible.
 	 */
-	uid_t uid;
-	gid_t gid;
+	uid_t uid = cred->uid;
+	gid_t gid = cred->gid;
 	int dumpable = 0;
 	/* Don't let security modules deny introspection */
 	if (task == current)
 		return 0;
-	current_uid_gid(&uid, &gid);
-	if ((uid != task->euid ||
-	     uid != task->suid ||
-	     uid != task->uid  ||
-	     gid != task->egid ||
-	     gid != task->sgid ||
-	     gid != task->gid) && !capable(CAP_SYS_PTRACE))
+	if ((uid != tcred->euid ||
+	     uid != tcred->suid ||
+	     uid != tcred->uid  ||
+	     gid != tcred->egid ||
+	     gid != tcred->sgid ||
+	     gid != tcred->gid) && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 	smp_rmb();
 	if (task->mm)
diff --git a/kernel/sched.c b/kernel/sched.c
index c3b8b1fcde0..733c59e645a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -345,7 +345,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-	tg = p->user->tg;
+	tg = p->cred->user->tg;
 #elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 				struct task_group, css);
@@ -5182,8 +5182,8 @@ recheck:
 
 		/* can't change other user's priorities */
 		euid = current_euid();
-		if (euid != p->euid &&
-		    euid != p->uid)
+		if (euid != p->cred->euid &&
+		    euid != p->cred->uid)
 			return -EPERM;
 	}
 
@@ -5417,7 +5417,9 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 
 	euid = current_euid();
 	retval = -EPERM;
-	if (euid != p->euid && euid != p->uid && !capable(CAP_SYS_NICE))
+	if (euid != p->cred->euid &&
+	    euid != p->cred->uid &&
+	    !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index 167b535fe1a..80e8a6489f9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -187,7 +187,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	 * In order to avoid problems with "switch_user()", we want to make
 	 * sure that the compiler doesn't re-load "t->user"
 	 */
-	user = t->user;
+	user = t->cred->user;
 	barrier();
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
@@ -582,8 +582,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 
 	uid = current_uid();
 	euid = current_euid();
-	if ((euid ^ t->suid) && (euid ^ t->uid) &&
-	    (uid  ^ t->suid) && (uid  ^ t->uid) &&
+	if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) &&
+	    (uid  ^ t->cred->suid) && (uid  ^ t->cred->uid) &&
 	    !capable(CAP_KILL)) {
 		switch (sig) {
 		case SIGCONT:
@@ -1100,8 +1100,8 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		goto out_unlock;
 	}
 	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
-	    && (euid != p->suid) && (euid != p->uid)
-	    && (uid != p->suid) && (uid != p->uid)) {
+	    && (euid != p->cred->suid) && (euid != p->cred->uid)
+	    && (uid != p->cred->suid) && (uid != p->cred->uid)) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1374,7 +1374,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
 	rcu_read_unlock();
 
-	info.si_uid = tsk->uid;
+	info.si_uid = tsk->cred->uid;
 
 	thread_group_cputime(tsk, &cputime);
 	info.si_utime = cputime_to_jiffies(cputime.utime);
@@ -1445,7 +1445,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
 	rcu_read_unlock();
 
-	info.si_uid = tsk->uid;
+	info.si_uid = tsk->cred->uid;
 
 	info.si_utime = cputime_to_clock_t(tsk->utime);
 	info.si_stime = cputime_to_clock_t(tsk->stime);
@@ -1713,7 +1713,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_errno = 0;
 		info->si_code = SI_USER;
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = current->parent->uid;
+		info->si_uid = current->parent->cred->uid;
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
diff --git a/kernel/sys.c b/kernel/sys.c
index ed5c29c748a..5d81f07c015 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -117,7 +117,9 @@ static int set_one_prio(struct task_struct *p, int niceval, int error)
 	uid_t euid = current_euid();
 	int no_nice;
 
-	if (p->uid != euid && p->euid != euid && !capable(CAP_SYS_NICE)) {
+	if (p->cred->uid  != euid &&
+	    p->cred->euid != euid &&
+	    !capable(CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
@@ -174,7 +176,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->user;
+			user = current->cred->user;
 			if (!who)
 				who = current_uid();
 			else
@@ -182,7 +184,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 					goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->uid == who)
+				if (p->cred->uid == who)
 					error = set_one_prio(p, niceval, error);
 			while_each_thread(g, p);
 			if (who != current_uid())
@@ -236,7 +238,7 @@ asmlinkage long sys_getpriority(int which, int who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->user;
+			user = current->cred->user;
 			if (!who)
 				who = current_uid();
 			else
@@ -244,7 +246,7 @@ asmlinkage long sys_getpriority(int which, int who)
 					goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->uid == who) {
+				if (p->cred->uid == who) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
@@ -472,8 +474,9 @@ void ctrl_alt_del(void)
  */
 asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 {
-	int old_rgid = current->gid;
-	int old_egid = current->egid;
+	struct cred *cred = current->cred;
+	int old_rgid = cred->gid;
+	int old_egid = cred->egid;
 	int new_rgid = old_rgid;
 	int new_egid = old_egid;
 	int retval;
@@ -484,7 +487,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 
 	if (rgid != (gid_t) -1) {
 		if ((old_rgid == rgid) ||
-		    (current->egid==rgid) ||
+		    (cred->egid == rgid) ||
 		    capable(CAP_SETGID))
 			new_rgid = rgid;
 		else
@@ -492,8 +495,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (egid != (gid_t) -1) {
 		if ((old_rgid == egid) ||
-		    (current->egid == egid) ||
-		    (current->sgid == egid) ||
+		    (cred->egid == egid) ||
+		    (cred->sgid == egid) ||
 		    capable(CAP_SETGID))
 			new_egid = egid;
 		else
@@ -505,10 +508,10 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (rgid != (gid_t) -1 ||
 	    (egid != (gid_t) -1 && egid != old_rgid))
-		current->sgid = new_egid;
-	current->fsgid = new_egid;
-	current->egid = new_egid;
-	current->gid = new_rgid;
+		cred->sgid = new_egid;
+	cred->fsgid = new_egid;
+	cred->egid = new_egid;
+	cred->gid = new_rgid;
 	key_fsgid_changed(current);
 	proc_id_connector(current, PROC_EVENT_GID);
 	return 0;
@@ -521,7 +524,8 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
  */
 asmlinkage long sys_setgid(gid_t gid)
 {
-	int old_egid = current->egid;
+	struct cred *cred = current->cred;
+	int old_egid = cred->egid;
 	int retval;
 
 	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
@@ -533,13 +537,13 @@ asmlinkage long sys_setgid(gid_t gid)
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->gid = current->egid = current->sgid = current->fsgid = gid;
-	} else if ((gid == current->gid) || (gid == current->sgid)) {
+		cred->gid = cred->egid = cred->sgid = cred->fsgid = gid;
+	} else if ((gid == cred->gid) || (gid == cred->sgid)) {
 		if (old_egid != gid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->egid = current->fsgid = gid;
+		cred->egid = cred->fsgid = gid;
 	}
 	else
 		return -EPERM;
@@ -570,7 +574,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
-	current->uid = new_ruid;
+	current->cred->uid = new_ruid;
 	return 0;
 }
 
@@ -591,6 +595,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
  */
 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 {
+	struct cred *cred = current->cred;
 	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
 	int retval;
 
@@ -598,14 +603,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	if (retval)
 		return retval;
 
-	new_ruid = old_ruid = current->uid;
-	new_euid = old_euid = current->euid;
-	old_suid = current->suid;
+	new_ruid = old_ruid = cred->uid;
+	new_euid = old_euid = cred->euid;
+	old_suid = cred->suid;
 
 	if (ruid != (uid_t) -1) {
 		new_ruid = ruid;
 		if ((old_ruid != ruid) &&
-		    (current->euid != ruid) &&
+		    (cred->euid != ruid) &&
 		    !capable(CAP_SETUID))
 			return -EPERM;
 	}
@@ -613,8 +618,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	if (euid != (uid_t) -1) {
 		new_euid = euid;
 		if ((old_ruid != euid) &&
-		    (current->euid != euid) &&
-		    (current->suid != euid) &&
+		    (cred->euid != euid) &&
+		    (cred->suid != euid) &&
 		    !capable(CAP_SETUID))
 			return -EPERM;
 	}
@@ -626,11 +631,11 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
-	current->fsuid = current->euid = new_euid;
+	cred->fsuid = cred->euid = new_euid;
 	if (ruid != (uid_t) -1 ||
 	    (euid != (uid_t) -1 && euid != old_ruid))
-		current->suid = current->euid;
-	current->fsuid = current->euid;
+		cred->suid = cred->euid;
+	cred->fsuid = cred->euid;
 
 	key_fsuid_changed(current);
 	proc_id_connector(current, PROC_EVENT_UID);
@@ -653,7 +658,8 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
  */
 asmlinkage long sys_setuid(uid_t uid)
 {
-	int old_euid = current->euid;
+	struct cred *cred = current->cred;
+	int old_euid = cred->euid;
 	int old_ruid, old_suid, new_suid;
 	int retval;
 
@@ -661,23 +667,23 @@ asmlinkage long sys_setuid(uid_t uid)
 	if (retval)
 		return retval;
 
-	old_ruid = current->uid;
-	old_suid = current->suid;
+	old_ruid = cred->uid;
+	old_suid = cred->suid;
 	new_suid = old_suid;
 	
 	if (capable(CAP_SETUID)) {
 		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
 			return -EAGAIN;
 		new_suid = uid;
-	} else if ((uid != current->uid) && (uid != new_suid))
+	} else if ((uid != cred->uid) && (uid != new_suid))
 		return -EPERM;
 
 	if (old_euid != uid) {
 		set_dumpable(current->mm, suid_dumpable);
 		smp_wmb();
 	}
-	current->fsuid = current->euid = uid;
-	current->suid = new_suid;
+	cred->fsuid = cred->euid = uid;
+	cred->suid = new_suid;
 
 	key_fsuid_changed(current);
 	proc_id_connector(current, PROC_EVENT_UID);
@@ -692,9 +698,10 @@ asmlinkage long sys_setuid(uid_t uid)
  */
 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
-	int old_ruid = current->uid;
-	int old_euid = current->euid;
-	int old_suid = current->suid;
+	struct cred *cred = current->cred;
+	int old_ruid = cred->uid;
+	int old_euid = cred->euid;
+	int old_suid = cred->suid;
 	int retval;
 
 	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
@@ -702,30 +709,31 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 		return retval;
 
 	if (!capable(CAP_SETUID)) {
-		if ((ruid != (uid_t) -1) && (ruid != current->uid) &&
-		    (ruid != current->euid) && (ruid != current->suid))
+		if ((ruid != (uid_t) -1) && (ruid != cred->uid) &&
+		    (ruid != cred->euid) && (ruid != cred->suid))
 			return -EPERM;
-		if ((euid != (uid_t) -1) && (euid != current->uid) &&
-		    (euid != current->euid) && (euid != current->suid))
+		if ((euid != (uid_t) -1) && (euid != cred->uid) &&
+		    (euid != cred->euid) && (euid != cred->suid))
 			return -EPERM;
-		if ((suid != (uid_t) -1) && (suid != current->uid) &&
-		    (suid != current->euid) && (suid != current->suid))
+		if ((suid != (uid_t) -1) && (suid != cred->uid) &&
+		    (suid != cred->euid) && (suid != cred->suid))
 			return -EPERM;
 	}
 	if (ruid != (uid_t) -1) {
-		if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0)
+		if (ruid != cred->uid &&
+		    set_user(ruid, euid != cred->euid) < 0)
 			return -EAGAIN;
 	}
 	if (euid != (uid_t) -1) {
-		if (euid != current->euid) {
+		if (euid != cred->euid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->euid = euid;
+		cred->euid = euid;
 	}
-	current->fsuid = current->euid;
+	cred->fsuid = cred->euid;
 	if (suid != (uid_t) -1)
-		current->suid = suid;
+		cred->suid = suid;
 
 	key_fsuid_changed(current);
 	proc_id_connector(current, PROC_EVENT_UID);
@@ -735,11 +743,12 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
 {
+	struct cred *cred = current->cred;
 	int retval;
 
-	if (!(retval = put_user(current->uid, ruid)) &&
-	    !(retval = put_user(current->euid, euid)))
-		retval = put_user(current->suid, suid);
+	if (!(retval = put_user(cred->uid, ruid)) &&
+	    !(retval = put_user(cred->euid, euid)))
+		retval = put_user(cred->suid, suid);
 
 	return retval;
 }
@@ -749,6 +758,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
  */
 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
+	struct cred *cred = current->cred;
 	int retval;
 
 	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
@@ -756,28 +766,28 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 		return retval;
 
 	if (!capable(CAP_SETGID)) {
-		if ((rgid != (gid_t) -1) && (rgid != current->gid) &&
-		    (rgid != current->egid) && (rgid != current->sgid))
+		if ((rgid != (gid_t) -1) && (rgid != cred->gid) &&
+		    (rgid != cred->egid) && (rgid != cred->sgid))
 			return -EPERM;
-		if ((egid != (gid_t) -1) && (egid != current->gid) &&
-		    (egid != current->egid) && (egid != current->sgid))
+		if ((egid != (gid_t) -1) && (egid != cred->gid) &&
+		    (egid != cred->egid) && (egid != cred->sgid))
 			return -EPERM;
-		if ((sgid != (gid_t) -1) && (sgid != current->gid) &&
-		    (sgid != current->egid) && (sgid != current->sgid))
+		if ((sgid != (gid_t) -1) && (sgid != cred->gid) &&
+		    (sgid != cred->egid) && (sgid != cred->sgid))
 			return -EPERM;
 	}
 	if (egid != (gid_t) -1) {
-		if (egid != current->egid) {
+		if (egid != cred->egid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->egid = egid;
+		cred->egid = egid;
 	}
-	current->fsgid = current->egid;
+	cred->fsgid = cred->egid;
 	if (rgid != (gid_t) -1)
-		current->gid = rgid;
+		cred->gid = rgid;
 	if (sgid != (gid_t) -1)
-		current->sgid = sgid;
+		cred->sgid = sgid;
 
 	key_fsgid_changed(current);
 	proc_id_connector(current, PROC_EVENT_GID);
@@ -786,11 +796,12 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
 {
+	struct cred *cred = current->cred;
 	int retval;
 
-	if (!(retval = put_user(current->gid, rgid)) &&
-	    !(retval = put_user(current->egid, egid)))
-		retval = put_user(current->sgid, sgid);
+	if (!(retval = put_user(cred->gid, rgid)) &&
+	    !(retval = put_user(cred->egid, egid)))
+		retval = put_user(cred->sgid, sgid);
 
 	return retval;
 }
@@ -804,20 +815,21 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
  */
 asmlinkage long sys_setfsuid(uid_t uid)
 {
+	struct cred *cred = current->cred;
 	int old_fsuid;
 
-	old_fsuid = current->fsuid;
+	old_fsuid = cred->fsuid;
 	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
 		return old_fsuid;
 
-	if (uid == current->uid || uid == current->euid ||
-	    uid == current->suid || uid == current->fsuid || 
+	if (uid == cred->uid || uid == cred->euid ||
+	    uid == cred->suid || uid == cred->fsuid ||
 	    capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->fsuid = uid;
+		cred->fsuid = uid;
 	}
 
 	key_fsuid_changed(current);
@@ -833,20 +845,21 @@ asmlinkage long sys_setfsuid(uid_t uid)
  */
 asmlinkage long sys_setfsgid(gid_t gid)
 {
+	struct cred *cred = current->cred;
 	int old_fsgid;
 
-	old_fsgid = current->fsgid;
+	old_fsgid = cred->fsgid;
 	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
 		return old_fsgid;
 
-	if (gid == current->gid || gid == current->egid ||
-	    gid == current->sgid || gid == current->fsgid || 
+	if (gid == cred->gid || gid == cred->egid ||
+	    gid == cred->sgid || gid == cred->fsgid ||
 	    capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
 			set_dumpable(current->mm, suid_dumpable);
 			smp_wmb();
 		}
-		current->fsgid = gid;
+		cred->fsgid = gid;
 		key_fsgid_changed(current);
 		proc_id_connector(current, PROC_EVENT_GID);
 	}
@@ -1208,8 +1221,15 @@ int groups_search(struct group_info *group_info, gid_t grp)
 	return 0;
 }
 
-/* validate and set current->group_info */
-int set_current_groups(struct group_info *group_info)
+/**
+ * set_groups - Change a group subscription in a security record
+ * @sec: The security record to alter
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon a task security
+ * record.
+ */
+int set_groups(struct cred *cred, struct group_info *group_info)
 {
 	int retval;
 	struct group_info *old_info;
@@ -1221,20 +1241,34 @@ int set_current_groups(struct group_info *group_info)
 	groups_sort(group_info);
 	get_group_info(group_info);
 
-	task_lock(current);
-	old_info = current->group_info;
-	current->group_info = group_info;
-	task_unlock(current);
+	spin_lock(&cred->lock);
+	old_info = cred->group_info;
+	cred->group_info = group_info;
+	spin_unlock(&cred->lock);
 
 	put_group_info(old_info);
-
 	return 0;
 }
 
+EXPORT_SYMBOL(set_groups);
+
+/**
+ * set_current_groups - Change current's group subscription
+ * @group_info: The group list to impose
+ *
+ * Validate a group subscription and, if valid, impose it upon current's task
+ * security record.
+ */
+int set_current_groups(struct group_info *group_info)
+{
+	return set_groups(current->cred, group_info);
+}
+
 EXPORT_SYMBOL(set_current_groups);
 
 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 {
+	struct cred *cred = current->cred;
 	int i = 0;
 
 	/*
@@ -1246,13 +1280,13 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 		return -EINVAL;
 
 	/* no need to grab task_lock here; it cannot change */
-	i = current->group_info->ngroups;
+	i = cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups_to_user(grouplist, current->group_info)) {
+		if (groups_to_user(grouplist, cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
@@ -1296,9 +1330,10 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
  */
 int in_group_p(gid_t grp)
 {
+	struct cred *cred = current->cred;
 	int retval = 1;
-	if (grp != current->fsgid)
-		retval = groups_search(current->group_info, grp);
+	if (grp != cred->fsgid)
+		retval = groups_search(cred->group_info, grp);
 	return retval;
 }
 
@@ -1306,9 +1341,10 @@ EXPORT_SYMBOL(in_group_p);
 
 int in_egroup_p(gid_t grp)
 {
+	struct cred *cred = current->cred;
 	int retval = 1;
-	if (grp != current->egid)
-		retval = groups_search(current->group_info, grp);
+	if (grp != cred->egid)
+		retval = groups_search(cred->group_info, grp);
 	return retval;
 }
 
@@ -1624,7 +1660,9 @@ asmlinkage long sys_umask(int mask)
 asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5)
 {
-	long error = 0;
+	struct task_struct *me = current;
+	unsigned char comm[sizeof(me->comm)];
+	long error;
 
 	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
 		return error;
@@ -1635,39 +1673,41 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 				error = -EINVAL;
 				break;
 			}
-			current->pdeath_signal = arg2;
+			me->pdeath_signal = arg2;
+			error = 0;
 			break;
 		case PR_GET_PDEATHSIG:
-			error = put_user(current->pdeath_signal, (int __user *)arg2);
+			error = put_user(me->pdeath_signal, (int __user *)arg2);
 			break;
 		case PR_GET_DUMPABLE:
-			error = get_dumpable(current->mm);
+			error = get_dumpable(me->mm);
 			break;
 		case PR_SET_DUMPABLE:
 			if (arg2 < 0 || arg2 > 1) {
 				error = -EINVAL;
 				break;
 			}
-			set_dumpable(current->mm, arg2);
+			set_dumpable(me->mm, arg2);
+			error = 0;
 			break;
 
 		case PR_SET_UNALIGN:
-			error = SET_UNALIGN_CTL(current, arg2);
+			error = SET_UNALIGN_CTL(me, arg2);
 			break;
 		case PR_GET_UNALIGN:
-			error = GET_UNALIGN_CTL(current, arg2);
+			error = GET_UNALIGN_CTL(me, arg2);
 			break;
 		case PR_SET_FPEMU:
-			error = SET_FPEMU_CTL(current, arg2);
+			error = SET_FPEMU_CTL(me, arg2);
 			break;
 		case PR_GET_FPEMU:
-			error = GET_FPEMU_CTL(current, arg2);
+			error = GET_FPEMU_CTL(me, arg2);
 			break;
 		case PR_SET_FPEXC:
-			error = SET_FPEXC_CTL(current, arg2);
+			error = SET_FPEXC_CTL(me, arg2);
 			break;
 		case PR_GET_FPEXC:
-			error = GET_FPEXC_CTL(current, arg2);
+			error = GET_FPEXC_CTL(me, arg2);
 			break;
 		case PR_GET_TIMING:
 			error = PR_TIMING_STATISTICAL;
@@ -1675,33 +1715,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TIMING:
 			if (arg2 != PR_TIMING_STATISTICAL)
 				error = -EINVAL;
+			else
+				error = 0;
 			break;
 
-		case PR_SET_NAME: {
-			struct task_struct *me = current;
-			unsigned char ncomm[sizeof(me->comm)];
-
-			ncomm[sizeof(me->comm)-1] = 0;
-			if (strncpy_from_user(ncomm, (char __user *)arg2,
-						sizeof(me->comm)-1) < 0)
+		case PR_SET_NAME:
+			comm[sizeof(me->comm)-1] = 0;
+			if (strncpy_from_user(comm, (char __user *)arg2,
+					      sizeof(me->comm) - 1) < 0)
 				return -EFAULT;
-			set_task_comm(me, ncomm);
+			set_task_comm(me, comm);
 			return 0;
-		}
-		case PR_GET_NAME: {
-			struct task_struct *me = current;
-			unsigned char tcomm[sizeof(me->comm)];
-
-			get_task_comm(tcomm, me);
-			if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
+		case PR_GET_NAME:
+			get_task_comm(comm, me);
+			if (copy_to_user((char __user *)arg2, comm,
+					 sizeof(comm)))
 				return -EFAULT;
 			return 0;
-		}
 		case PR_GET_ENDIAN:
-			error = GET_ENDIAN(current, arg2);
+			error = GET_ENDIAN(me, arg2);
 			break;
 		case PR_SET_ENDIAN:
-			error = SET_ENDIAN(current, arg2);
+			error = SET_ENDIAN(me, arg2);
 			break;
 
 		case PR_GET_SECCOMP:
@@ -1725,6 +1760,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 					current->default_timer_slack_ns;
 			else
 				current->timer_slack_ns = arg2;
+			error = 0;
 			break;
 		default:
 			error = -EINVAL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9f3b478f917..5c97c5b4ea8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -246,7 +246,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
 	data->pid = tsk->pid;
-	data->uid = tsk->uid;
+	data->uid = task_uid(tsk);
 	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
 	data->policy = tsk->policy;
 	data->rt_priority = tsk->rt_priority;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 8ebcd8532df..6d1ed07bf31 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -53,8 +53,8 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 		stats->ac_flag |= AXSIG;
 	stats->ac_nice	 = task_nice(tsk);
 	stats->ac_sched	 = tsk->policy;
-	stats->ac_uid	 = tsk->uid;
-	stats->ac_gid	 = tsk->gid;
+	stats->ac_uid	 = tsk->cred->uid;
+	stats->ac_gid	 = tsk->cred->gid;
 	stats->ac_pid	 = tsk->pid;
 	rcu_read_lock();
 	stats->ac_ppid	 = pid_alive(tsk) ?
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 3e41c1673e2..71f07fc39fe 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -86,9 +86,9 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
 {
 	int retval;
 
-	if (!(retval = put_user(high2lowuid(current->uid), ruid)) &&
-	    !(retval = put_user(high2lowuid(current->euid), euid)))
-		retval = put_user(high2lowuid(current->suid), suid);
+	if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) &&
+	    !(retval = put_user(high2lowuid(current->cred->euid), euid)))
+		retval = put_user(high2lowuid(current->cred->suid), suid);
 
 	return retval;
 }
@@ -106,9 +106,9 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
 {
 	int retval;
 
-	if (!(retval = put_user(high2lowgid(current->gid), rgid)) &&
-	    !(retval = put_user(high2lowgid(current->egid), egid)))
-		retval = put_user(high2lowgid(current->sgid), sgid);
+	if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) &&
+	    !(retval = put_user(high2lowgid(current->cred->egid), egid)))
+		retval = put_user(high2lowgid(current->cred->sgid), sgid);
 
 	return retval;
 }
@@ -166,20 +166,20 @@ asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->group_info);
-	i = current->group_info->ngroups;
+	get_group_info(current->cred->group_info);
+	i = current->cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->group_info)) {
+		if (groups16_to_user(grouplist, current->cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->group_info);
+	put_group_info(current->cred->group_info);
 	return i;
 }
 
@@ -210,20 +210,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
 
 asmlinkage long sys_getuid16(void)
 {
-	return high2lowuid(current->uid);
+	return high2lowuid(current->cred->uid);
 }
 
 asmlinkage long sys_geteuid16(void)
 {
-	return high2lowuid(current->euid);
+	return high2lowuid(current->cred->euid);
 }
 
 asmlinkage long sys_getgid16(void)
 {
-	return high2lowgid(current->gid);
+	return high2lowgid(current->cred->gid);
 }
 
 asmlinkage long sys_getegid16(void)
 {
-	return high2lowgid(current->egid);
+	return high2lowgid(current->cred->egid);
 }
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae4..104d22ac84d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -457,11 +457,11 @@ void switch_uid(struct user_struct *new_user)
 	 * cheaply with the new uid cache, so if it matters
 	 * we should be checking for it.  -DaveM
 	 */
-	old_user = current->user;
+	old_user = current->cred->user;
 	atomic_inc(&new_user->processes);
 	atomic_dec(&old_user->processes);
 	switch_uid_keyring(new_user);
-	current->user = new_user;
+	current->cred->user = new_user;
 	sched_switch_user(current);
 
 	/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 07a96474077..b23492ee3e5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1110,12 +1110,12 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		const unsigned long __user *old_nodes,
 		const unsigned long __user *new_nodes)
 {
+	struct cred *cred, *tcred;
 	struct mm_struct *mm;
 	struct task_struct *task;
 	nodemask_t old;
 	nodemask_t new;
 	nodemask_t task_nodes;
-	uid_t uid, euid;
 	int err;
 
 	err = get_nodes(&old, old_nodes, maxnode);
@@ -1145,10 +1145,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 * capabilities, superuser privileges or the same
 	 * userid as the target process.
 	 */
-	uid = current_uid();
-	euid = current_euid();
-	if (euid != task->suid && euid != task->uid &&
-	    uid  != task->suid && uid  != task->uid &&
+	cred = current->cred;
+	tcred = task->cred;
+	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
+	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
 	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
diff --git a/mm/migrate.c b/mm/migrate.c
index 6263c24c4af..794443da1b4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1045,10 +1045,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 			const int __user *nodes,
 			int __user *status, int flags)
 {
+	struct cred *cred, *tcred;
 	struct task_struct *task;
 	struct mm_struct *mm;
 	int err;
-	uid_t uid, euid;
 
 	/* Check flags */
 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1076,10 +1076,10 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 	 * capabilities, superuser privileges or the same
 	 * userid as the target process.
 	 */
-	uid = current_uid();
-	euid = current_euid();
-	if (euid != task->suid && euid != task->uid &&
-	    uid  != task->suid && uid  != task->uid &&
+	cred = current->cred;
+	tcred = task->cred;
+	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
+	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
 	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 34a458aa799..3af787ba207 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -298,7 +298,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 
 		task_lock(p);
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
-		       p->pid, p->uid, p->tgid, p->mm->total_vm,
+		       p->pid, p->cred->uid, p->tgid, p->mm->total_vm,
 		       get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj,
 		       p->comm);
 		task_unlock(p);
diff --git a/net/core/scm.c b/net/core/scm.c
index 4681d8f9b45..c28ca32a7d9 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -44,11 +44,13 @@
 
 static __inline__ int scm_check_creds(struct ucred *creds)
 {
+	struct cred *cred = current->cred;
+
 	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
-	    ((creds->uid == current_uid()   || creds->uid == current_euid() ||
-	      creds->uid == current_suid()) || capable(CAP_SETUID)) &&
-	    ((creds->gid == current_gid()   || creds->gid == current_egid() ||
-	      creds->gid == current_sgid()) || capable(CAP_SETGID))) {
+	    ((creds->uid == cred->uid   || creds->uid == cred->euid ||
+	      creds->uid == cred->suid) || capable(CAP_SETUID)) &&
+	    ((creds->gid == cred->gid   || creds->gid == cred->egid ||
+	      creds->gid == cred->sgid) || capable(CAP_SETGID))) {
 	       return 0;
 	}
 	return -EPERM;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 8fc38057880..c7954321260 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -353,7 +353,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
 	struct auth_cred acred = {
 		.uid = current_fsuid(),
 		.gid = current_fsgid(),
-		.group_info = current->group_info,
+		.group_info = current->cred->group_info,
 	};
 	struct rpc_cred *ret;
 
diff --git a/security/commoncap.c b/security/commoncap.c
index fb4e240720d..fa61679f8c7 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -30,7 +30,7 @@
 
 int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
-	NETLINK_CB(skb).eff_cap = current->cap_effective;
+	NETLINK_CB(skb).eff_cap = current_cap();
 	return 0;
 }
 
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(cap_netlink_recv);
 int cap_capable(struct task_struct *tsk, int cap, int audit)
 {
 	/* Derived from include/linux/sched.h:capable. */
-	if (cap_raised(tsk->cap_effective, cap))
+	if (cap_raised(tsk->cred->cap_effective, cap))
 		return 0;
 	return -EPERM;
 }
@@ -67,7 +67,8 @@ int cap_settime(struct timespec *ts, struct timezone *tz)
 int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
 {
 	/* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */
-	if (cap_issubset(child->cap_permitted, current->cap_permitted))
+	if (cap_issubset(child->cred->cap_permitted,
+			 current->cred->cap_permitted))
 		return 0;
 	if (capable(CAP_SYS_PTRACE))
 		return 0;
@@ -76,8 +77,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
 
 int cap_ptrace_traceme(struct task_struct *parent)
 {
-	/* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */
-	if (cap_issubset(current->cap_permitted, parent->cap_permitted))
+	if (cap_issubset(current->cred->cap_permitted,
+			 parent->cred->cap_permitted))
 		return 0;
 	if (has_capability(parent, CAP_SYS_PTRACE))
 		return 0;
@@ -87,10 +88,12 @@ int cap_ptrace_traceme(struct task_struct *parent)
 int cap_capget (struct task_struct *target, kernel_cap_t *effective,
 		kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
+	struct cred *cred = target->cred;
+
 	/* Derived from kernel/capability.c:sys_capget. */
-	*effective = target->cap_effective;
-	*inheritable = target->cap_inheritable;
-	*permitted = target->cap_permitted;
+	*effective   = cred->cap_effective;
+	*inheritable = cred->cap_inheritable;
+	*permitted   = cred->cap_permitted;
 	return 0;
 }
 
@@ -122,24 +125,26 @@ int cap_capset_check(const kernel_cap_t *effective,
 		     const kernel_cap_t *inheritable,
 		     const kernel_cap_t *permitted)
 {
+	const struct cred *cred = current->cred;
+
 	if (cap_inh_is_capped()
 	    && !cap_issubset(*inheritable,
-			     cap_combine(current->cap_inheritable,
-					 current->cap_permitted))) {
+			     cap_combine(cred->cap_inheritable,
+					 cred->cap_permitted))) {
 		/* incapable of using this inheritable set */
 		return -EPERM;
 	}
 	if (!cap_issubset(*inheritable,
-			   cap_combine(current->cap_inheritable,
-				       current->cap_bset))) {
+			   cap_combine(cred->cap_inheritable,
+				       cred->cap_bset))) {
 		/* no new pI capabilities outside bounding set */
 		return -EPERM;
 	}
 
 	/* verify restrictions on target's new Permitted set */
 	if (!cap_issubset (*permitted,
-			   cap_combine (current->cap_permitted,
-					current->cap_permitted))) {
+			   cap_combine (cred->cap_permitted,
+					cred->cap_permitted))) {
 		return -EPERM;
 	}
 
@@ -155,9 +160,11 @@ void cap_capset_set(const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
 		    const kernel_cap_t *permitted)
 {
-	current->cap_effective = *effective;
-	current->cap_inheritable = *inheritable;
-	current->cap_permitted = *permitted;
+	struct cred *cred = current->cred;
+
+	cred->cap_effective   = *effective;
+	cred->cap_inheritable = *inheritable;
+	cred->cap_permitted   = *permitted;
 }
 
 static inline void bprm_clear_caps(struct linux_binprm *bprm)
@@ -211,8 +218,8 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 		 * pP' = (X & fP) | (pI & fI)
 		 */
 		bprm->cap_post_exec_permitted.cap[i] =
-			(current->cap_bset.cap[i] & permitted) |
-			(current->cap_inheritable.cap[i] & inheritable);
+			(current->cred->cap_bset.cap[i] & permitted) |
+			(current->cred->cap_inheritable.cap[i] & inheritable);
 
 		if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) {
 			/*
@@ -354,8 +361,8 @@ int cap_bprm_set_security (struct linux_binprm *bprm)
 		if (bprm->e_uid == 0 || current_uid() == 0) {
 			/* pP' = (cap_bset & ~0) | (pI & ~0) */
 			bprm->cap_post_exec_permitted = cap_combine(
-				current->cap_bset, current->cap_inheritable
-				);
+				current->cred->cap_bset,
+				current->cred->cap_inheritable);
 			bprm->cap_effective = (bprm->e_uid == 0);
 			ret = 0;
 		}
@@ -366,44 +373,39 @@ int cap_bprm_set_security (struct linux_binprm *bprm)
 
 void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
-	kernel_cap_t pP = current->cap_permitted;
-	kernel_cap_t pE = current->cap_effective;
-	uid_t uid;
-	gid_t gid;
+	struct cred *cred = current->cred;
 
-	current_uid_gid(&uid, &gid);
-
-	if (bprm->e_uid != uid || bprm->e_gid != gid ||
+	if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid ||
 	    !cap_issubset(bprm->cap_post_exec_permitted,
-			  current->cap_permitted)) {
+			  cred->cap_permitted)) {
 		set_dumpable(current->mm, suid_dumpable);
 		current->pdeath_signal = 0;
 
 		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 			if (!capable(CAP_SETUID)) {
-				bprm->e_uid = uid;
-				bprm->e_gid = gid;
+				bprm->e_uid = cred->uid;
+				bprm->e_gid = cred->gid;
 			}
 			if (cap_limit_ptraced_target()) {
 				bprm->cap_post_exec_permitted = cap_intersect(
 					bprm->cap_post_exec_permitted,
-					current->cap_permitted);
+					cred->cap_permitted);
 			}
 		}
 	}
 
-	current->suid = current->euid = current->fsuid = bprm->e_uid;
-	current->sgid = current->egid = current->fsgid = bprm->e_gid;
+	cred->suid = cred->euid = cred->fsuid = bprm->e_uid;
+	cred->sgid = cred->egid = cred->fsgid = bprm->e_gid;
 
 	/* For init, we want to retain the capabilities set
 	 * in the init_task struct. Thus we skip the usual
 	 * capability rules */
 	if (!is_global_init(current)) {
-		current->cap_permitted = bprm->cap_post_exec_permitted;
+		cred->cap_permitted = bprm->cap_post_exec_permitted;
 		if (bprm->cap_effective)
-			current->cap_effective = bprm->cap_post_exec_permitted;
+			cred->cap_effective = bprm->cap_post_exec_permitted;
 		else
-			cap_clear(current->cap_effective);
+			cap_clear(cred->cap_effective);
 	}
 
 	/*
@@ -418,27 +420,30 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	 * Number 1 above might fail if you don't have a full bset, but I think
 	 * that is interesting information to audit.
 	 */
-	if (!cap_isclear(current->cap_effective)) {
-		if (!cap_issubset(CAP_FULL_SET, current->cap_effective) ||
-		    (bprm->e_uid != 0) || (current->uid != 0) ||
+	if (!cap_isclear(cred->cap_effective)) {
+		if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) ||
+		    (bprm->e_uid != 0) || (cred->uid != 0) ||
 		    issecure(SECURE_NOROOT))
-			audit_log_bprm_fcaps(bprm, &pP, &pE);
+			audit_log_bprm_fcaps(bprm, &cred->cap_permitted,
+					     &cred->cap_effective);
 	}
 
-	current->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+	cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
 }
 
 int cap_bprm_secureexec (struct linux_binprm *bprm)
 {
-	if (current_uid() != 0) {
+	const struct cred *cred = current->cred;
+
+	if (cred->uid != 0) {
 		if (bprm->cap_effective)
 			return 1;
 		if (!cap_isclear(bprm->cap_post_exec_permitted))
 			return 1;
 	}
 
-	return (current_euid() != current_uid() ||
-		current_egid() != current_gid());
+	return (cred->euid != cred->uid ||
+		cred->egid != cred->gid);
 }
 
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
@@ -501,25 +506,27 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 static inline void cap_emulate_setxuid (int old_ruid, int old_euid,
 					int old_suid)
 {
-	uid_t euid = current_euid();
+	struct cred *cred = current->cred;
 
 	if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
-	    (current_uid()  != 0 && euid != 0 && current_suid() != 0) &&
+	    (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) &&
 	    !issecure(SECURE_KEEP_CAPS)) {
-		cap_clear (current->cap_permitted);
-		cap_clear (current->cap_effective);
+		cap_clear (cred->cap_permitted);
+		cap_clear (cred->cap_effective);
 	}
-	if (old_euid == 0 && euid != 0) {
-		cap_clear (current->cap_effective);
+	if (old_euid == 0 && cred->euid != 0) {
+		cap_clear (cred->cap_effective);
 	}
-	if (old_euid != 0 && euid == 0) {
-		current->cap_effective = current->cap_permitted;
+	if (old_euid != 0 && cred->euid == 0) {
+		cred->cap_effective = cred->cap_permitted;
 	}
 }
 
 int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
 			  int flags)
 {
+	struct cred *cred = current->cred;
+
 	switch (flags) {
 	case LSM_SETID_RE:
 	case LSM_SETID_ID:
@@ -541,16 +548,16 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
 			 */
 
 			if (!issecure (SECURE_NO_SETUID_FIXUP)) {
-				if (old_fsuid == 0 && current_fsuid() != 0) {
-					current->cap_effective =
+				if (old_fsuid == 0 && cred->fsuid != 0) {
+					cred->cap_effective =
 						cap_drop_fs_set(
-						    current->cap_effective);
+							cred->cap_effective);
 				}
-				if (old_fsuid != 0 && current_fsuid() == 0) {
-					current->cap_effective =
+				if (old_fsuid != 0 && cred->fsuid == 0) {
+					cred->cap_effective =
 						cap_raise_fs_set(
-						    current->cap_effective,
-						    current->cap_permitted);
+						    cred->cap_effective,
+						    cred->cap_permitted);
 				}
 			}
 			break;
@@ -575,7 +582,8 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
  */
 static int cap_safe_nice(struct task_struct *p)
 {
-	if (!cap_issubset(p->cap_permitted, current->cap_permitted) &&
+	if (!cap_issubset(p->cred->cap_permitted,
+			  current->cred->cap_permitted) &&
 	    !capable(CAP_SYS_NICE))
 		return -EPERM;
 	return 0;
@@ -610,7 +618,7 @@ static long cap_prctl_drop(unsigned long cap)
 		return -EPERM;
 	if (!cap_valid(cap))
 		return -EINVAL;
-	cap_lower(current->cap_bset, cap);
+	cap_lower(current->cred->cap_bset, cap);
 	return 0;
 }
 
@@ -633,6 +641,7 @@ int cap_task_setnice (struct task_struct *p, int nice)
 int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		   unsigned long arg4, unsigned long arg5, long *rc_p)
 {
+	struct cred *cred = current->cred;
 	long error = 0;
 
 	switch (option) {
@@ -640,7 +649,7 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		if (!cap_valid(arg2))
 			error = -EINVAL;
 		else
-			error = !!cap_raised(current->cap_bset, arg2);
+			error = !!cap_raised(cred->cap_bset, arg2);
 		break;
 #ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 	case PR_CAPBSET_DROP:
@@ -667,9 +676,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 	 * capability-based-privilege environment.
 	 */
 	case PR_SET_SECUREBITS:
-		if ((((current->securebits & SECURE_ALL_LOCKS) >> 1)
-		     & (current->securebits ^ arg2))                  /*[1]*/
-		    || ((current->securebits & SECURE_ALL_LOCKS
+		if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1)
+		     & (cred->securebits ^ arg2))                  /*[1]*/
+		    || ((cred->securebits & SECURE_ALL_LOCKS
 			 & ~arg2))                                    /*[2]*/
 		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/
 		    || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/
@@ -682,11 +691,11 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 */
 			error = -EPERM;  /* cannot change a locked bit */
 		} else {
-			current->securebits = arg2;
+			cred->securebits = arg2;
 		}
 		break;
 	case PR_GET_SECUREBITS:
-		error = current->securebits;
+		error = cred->securebits;
 		break;
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
@@ -701,10 +710,9 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		else if (issecure(SECURE_KEEP_CAPS_LOCKED))
 			error = -EPERM;
 		else if (arg2)
-			current->securebits |= issecure_mask(SECURE_KEEP_CAPS);
+			cred->securebits |= issecure_mask(SECURE_KEEP_CAPS);
 		else
-			current->securebits &=
-				~issecure_mask(SECURE_KEEP_CAPS);
+			cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
 		break;
 
 	default:
@@ -719,11 +727,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 
 void cap_task_reparent_to_init (struct task_struct *p)
 {
-	cap_set_init_eff(p->cap_effective);
-	cap_clear(p->cap_inheritable);
-	cap_set_full(p->cap_permitted);
-	p->securebits = SECUREBITS_DEFAULT;
-	return;
+	struct cred *cred = p->cred;
+
+	cap_set_init_eff(cred->cap_effective);
+	cap_clear(cred->cap_inheritable);
+	cap_set_full(cred->cap_permitted);
+	p->cred->securebits = SECUREBITS_DEFAULT;
 }
 
 int cap_syslog (int type)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index fcce331eca7..8833b447ade 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -889,7 +889,7 @@ long keyctl_instantiate_key(key_serial_t id,
 	/* the appropriate instantiation authorisation key must have been
 	 * assumed before calling this */
 	ret = -EPERM;
-	instkey = current->request_key_auth;
+	instkey = current->cred->request_key_auth;
 	if (!instkey)
 		goto error;
 
@@ -932,8 +932,8 @@ long keyctl_instantiate_key(key_serial_t id,
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
 	if (ret == 0) {
-		key_put(current->request_key_auth);
-		current->request_key_auth = NULL;
+		key_put(current->cred->request_key_auth);
+		current->cred->request_key_auth = NULL;
 	}
 
 error2:
@@ -960,7 +960,7 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 	/* the appropriate instantiation authorisation key must have been
 	 * assumed before calling this */
 	ret = -EPERM;
-	instkey = current->request_key_auth;
+	instkey = current->cred->request_key_auth;
 	if (!instkey)
 		goto error;
 
@@ -983,8 +983,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
 	if (ret == 0) {
-		key_put(current->request_key_auth);
-		current->request_key_auth = NULL;
+		key_put(current->cred->request_key_auth);
+		current->cred->request_key_auth = NULL;
 	}
 
 error:
@@ -999,6 +999,7 @@ error:
  */
 long keyctl_set_reqkey_keyring(int reqkey_defl)
 {
+	struct cred *cred = current->cred;
 	int ret;
 
 	switch (reqkey_defl) {
@@ -1018,10 +1019,10 @@ long keyctl_set_reqkey_keyring(int reqkey_defl)
 	case KEY_REQKEY_DEFL_USER_KEYRING:
 	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
 	set:
-		current->jit_keyring = reqkey_defl;
+		cred->jit_keyring = reqkey_defl;
 
 	case KEY_REQKEY_DEFL_NO_CHANGE:
-		return current->jit_keyring;
+		return cred->jit_keyring;
 
 	case KEY_REQKEY_DEFL_GROUP_KEYRING:
 	default:
@@ -1086,8 +1087,8 @@ long keyctl_assume_authority(key_serial_t id)
 
 	/* we divest ourselves of authority if given an ID of 0 */
 	if (id == 0) {
-		key_put(current->request_key_auth);
-		current->request_key_auth = NULL;
+		key_put(current->cred->request_key_auth);
+		current->cred->request_key_auth = NULL;
 		ret = 0;
 		goto error;
 	}
@@ -1103,8 +1104,8 @@ long keyctl_assume_authority(key_serial_t id)
 		goto error;
 	}
 
-	key_put(current->request_key_auth);
-	current->request_key_auth = authkey;
+	key_put(current->cred->request_key_auth);
+	current->cred->request_key_auth = authkey;
 	ret = authkey->serial;
 
 error:
diff --git a/security/keys/permission.c b/security/keys/permission.c
index 3b41f9b5253..baf3d5f31e7 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -22,6 +22,7 @@ int key_task_permission(const key_ref_t key_ref,
 			struct task_struct *context,
 			key_perm_t perm)
 {
+	struct cred *cred = context->cred;
 	struct key *key;
 	key_perm_t kperm;
 	int ret;
@@ -29,7 +30,7 @@ int key_task_permission(const key_ref_t key_ref,
 	key = key_ref_to_ptr(key_ref);
 
 	/* use the second 8-bits of permissions for keys the caller owns */
-	if (key->uid == context->fsuid) {
+	if (key->uid == cred->fsuid) {
 		kperm = key->perm >> 16;
 		goto use_these_perms;
 	}
@@ -37,14 +38,14 @@ int key_task_permission(const key_ref_t key_ref,
 	/* use the third 8-bits of permissions for keys the caller has a group
 	 * membership in common with */
 	if (key->gid != -1 && key->perm & KEY_GRP_ALL) {
-		if (key->gid == context->fsgid) {
+		if (key->gid == cred->fsgid) {
 			kperm = key->perm >> 8;
 			goto use_these_perms;
 		}
 
-		task_lock(context);
-		ret = groups_search(context->group_info, key->gid);
-		task_unlock(context);
+		spin_lock(&cred->lock);
+		ret = groups_search(cred->group_info, key->gid);
+		spin_unlock(&cred->lock);
 
 		if (ret) {
 			kperm = key->perm >> 8;
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 1c793b7090a..b0904cdda2e 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -42,7 +42,7 @@ struct key_user root_key_user = {
  */
 int install_user_keyrings(void)
 {
-	struct user_struct *user = current->user;
+	struct user_struct *user = current->cred->user;
 	struct key *uid_keyring, *session_keyring;
 	char buf[20];
 	int ret;
@@ -156,7 +156,7 @@ int install_thread_keyring(void)
 
 	sprintf(buf, "_tid.%u", tsk->pid);
 
-	keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk,
+	keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
 				KEY_ALLOC_QUOTA_OVERRUN, NULL);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
@@ -164,8 +164,8 @@ int install_thread_keyring(void)
 	}
 
 	task_lock(tsk);
-	old = tsk->thread_keyring;
-	tsk->thread_keyring = keyring;
+	old = tsk->cred->thread_keyring;
+	tsk->cred->thread_keyring = keyring;
 	task_unlock(tsk);
 
 	ret = 0;
@@ -192,7 +192,7 @@ int install_process_keyring(void)
 	if (!tsk->signal->process_keyring) {
 		sprintf(buf, "_pid.%u", tsk->tgid);
 
-		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk,
+		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
 					KEY_ALLOC_QUOTA_OVERRUN, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
@@ -238,7 +238,7 @@ static int install_session_keyring(struct key *keyring)
 		if (tsk->signal->session_keyring)
 			flags = KEY_ALLOC_IN_QUOTA;
 
-		keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk,
+		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
 					flags, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
@@ -292,14 +292,14 @@ int copy_thread_group_keys(struct task_struct *tsk)
  */
 int copy_keys(unsigned long clone_flags, struct task_struct *tsk)
 {
-	key_check(tsk->thread_keyring);
-	key_check(tsk->request_key_auth);
+	key_check(tsk->cred->thread_keyring);
+	key_check(tsk->cred->request_key_auth);
 
 	/* no thread keyring yet */
-	tsk->thread_keyring = NULL;
+	tsk->cred->thread_keyring = NULL;
 
 	/* copy the request_key() authorisation for this thread */
-	key_get(tsk->request_key_auth);
+	key_get(tsk->cred->request_key_auth);
 
 	return 0;
 
@@ -322,8 +322,8 @@ void exit_thread_group_keys(struct signal_struct *tg)
  */
 void exit_keys(struct task_struct *tsk)
 {
-	key_put(tsk->thread_keyring);
-	key_put(tsk->request_key_auth);
+	key_put(tsk->cred->thread_keyring);
+	key_put(tsk->cred->request_key_auth);
 
 } /* end exit_keys() */
 
@@ -337,8 +337,8 @@ int exec_keys(struct task_struct *tsk)
 
 	/* newly exec'd tasks don't get a thread keyring */
 	task_lock(tsk);
-	old = tsk->thread_keyring;
-	tsk->thread_keyring = NULL;
+	old = tsk->cred->thread_keyring;
+	tsk->cred->thread_keyring = NULL;
 	task_unlock(tsk);
 
 	key_put(old);
@@ -373,10 +373,11 @@ int suid_keys(struct task_struct *tsk)
 void key_fsuid_changed(struct task_struct *tsk)
 {
 	/* update the ownership of the thread keyring */
-	if (tsk->thread_keyring) {
-		down_write(&tsk->thread_keyring->sem);
-		tsk->thread_keyring->uid = tsk->fsuid;
-		up_write(&tsk->thread_keyring->sem);
+	BUG_ON(!tsk->cred);
+	if (tsk->cred->thread_keyring) {
+		down_write(&tsk->cred->thread_keyring->sem);
+		tsk->cred->thread_keyring->uid = tsk->cred->fsuid;
+		up_write(&tsk->cred->thread_keyring->sem);
 	}
 
 } /* end key_fsuid_changed() */
@@ -388,10 +389,11 @@ void key_fsuid_changed(struct task_struct *tsk)
 void key_fsgid_changed(struct task_struct *tsk)
 {
 	/* update the ownership of the thread keyring */
-	if (tsk->thread_keyring) {
-		down_write(&tsk->thread_keyring->sem);
-		tsk->thread_keyring->gid = tsk->fsgid;
-		up_write(&tsk->thread_keyring->sem);
+	BUG_ON(!tsk->cred);
+	if (tsk->cred->thread_keyring) {
+		down_write(&tsk->cred->thread_keyring->sem);
+		tsk->cred->thread_keyring->gid = tsk->cred->fsgid;
+		up_write(&tsk->cred->thread_keyring->sem);
 	}
 
 } /* end key_fsgid_changed() */
@@ -426,9 +428,9 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	err = ERR_PTR(-EAGAIN);
 
 	/* search the thread keyring first */
-	if (context->thread_keyring) {
+	if (context->cred->thread_keyring) {
 		key_ref = keyring_search_aux(
-			make_key_ref(context->thread_keyring, 1),
+			make_key_ref(context->cred->thread_keyring, 1),
 			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
@@ -493,9 +495,9 @@ key_ref_t search_process_keyrings(struct key_type *type,
 		}
 	}
 	/* or search the user-session keyring */
-	else if (context->user->session_keyring) {
+	else if (context->cred->user->session_keyring) {
 		key_ref = keyring_search_aux(
-			make_key_ref(context->user->session_keyring, 1),
+			make_key_ref(context->cred->user->session_keyring, 1),
 			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
@@ -517,20 +519,20 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	 * search the keyrings of the process mentioned there
 	 * - we don't permit access to request_key auth keys via this method
 	 */
-	if (context->request_key_auth &&
+	if (context->cred->request_key_auth &&
 	    context == current &&
 	    type != &key_type_request_key_auth
 	    ) {
 		/* defend against the auth key being revoked */
-		down_read(&context->request_key_auth->sem);
+		down_read(&context->cred->request_key_auth->sem);
 
-		if (key_validate(context->request_key_auth) == 0) {
-			rka = context->request_key_auth->payload.data;
+		if (key_validate(context->cred->request_key_auth) == 0) {
+			rka = context->cred->request_key_auth->payload.data;
 
 			key_ref = search_process_keyrings(type, description,
 							  match, rka->context);
 
-			up_read(&context->request_key_auth->sem);
+			up_read(&context->cred->request_key_auth->sem);
 
 			if (!IS_ERR(key_ref))
 				goto found;
@@ -547,7 +549,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 				break;
 			}
 		} else {
-			up_read(&context->request_key_auth->sem);
+			up_read(&context->cred->request_key_auth->sem);
 		}
 	}
 
@@ -580,15 +582,16 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 {
 	struct request_key_auth *rka;
 	struct task_struct *t = current;
-	key_ref_t key_ref, skey_ref;
+	struct cred *cred = t->cred;
 	struct key *key;
+	key_ref_t key_ref, skey_ref;
 	int ret;
 
 	key_ref = ERR_PTR(-ENOKEY);
 
 	switch (id) {
 	case KEY_SPEC_THREAD_KEYRING:
-		if (!t->thread_keyring) {
+		if (!cred->thread_keyring) {
 			if (!create)
 				goto error;
 
@@ -599,7 +602,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 			}
 		}
 
-		key = t->thread_keyring;
+		key = cred->thread_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
@@ -628,7 +631,8 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 			ret = install_user_keyrings();
 			if (ret < 0)
 				goto error;
-			ret = install_session_keyring(t->user->session_keyring);
+			ret = install_session_keyring(
+				cred->user->session_keyring);
 			if (ret < 0)
 				goto error;
 		}
@@ -641,25 +645,25 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 		break;
 
 	case KEY_SPEC_USER_KEYRING:
-		if (!t->user->uid_keyring) {
+		if (!cred->user->uid_keyring) {
 			ret = install_user_keyrings();
 			if (ret < 0)
 				goto error;
 		}
 
-		key = t->user->uid_keyring;
+		key = cred->user->uid_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_USER_SESSION_KEYRING:
-		if (!t->user->session_keyring) {
+		if (!cred->user->session_keyring) {
 			ret = install_user_keyrings();
 			if (ret < 0)
 				goto error;
 		}
 
-		key = t->user->session_keyring;
+		key = cred->user->session_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
@@ -670,7 +674,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 		goto error;
 
 	case KEY_SPEC_REQKEY_AUTH_KEY:
-		key = t->request_key_auth;
+		key = cred->request_key_auth;
 		if (!key)
 			goto error;
 
@@ -679,19 +683,19 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 		break;
 
 	case KEY_SPEC_REQUESTOR_KEYRING:
-		if (!t->request_key_auth)
+		if (!cred->request_key_auth)
 			goto error;
 
-		down_read(&t->request_key_auth->sem);
-		if (t->request_key_auth->flags & KEY_FLAG_REVOKED) {
+		down_read(&cred->request_key_auth->sem);
+		if (cred->request_key_auth->flags & KEY_FLAG_REVOKED) {
 			key_ref = ERR_PTR(-EKEYREVOKED);
 			key = NULL;
 		} else {
-			rka = t->request_key_auth->payload.data;
+			rka = cred->request_key_auth->payload.data;
 			key = rka->dest_keyring;
 			atomic_inc(&key->usage);
 		}
-		up_read(&t->request_key_auth->sem);
+		up_read(&cred->request_key_auth->sem);
 		if (!key)
 			goto error;
 		key_ref = make_key_ref(key, 1);
@@ -791,7 +795,7 @@ long join_session_keyring(const char *name)
 	keyring = find_keyring_by_name(name, false);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
-		keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk,
+		keyring = keyring_alloc(name, tsk->cred->uid, tsk->cred->gid, tsk,
 					KEY_ALLOC_IN_QUOTA, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 8e9d93b4a40..3e9b9eb1dd2 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -104,7 +104,8 @@ static int call_sbin_request_key(struct key_construction *cons,
 
 	/* we specify the process's default keyrings */
 	sprintf(keyring_str[0], "%d",
-		tsk->thread_keyring ? tsk->thread_keyring->serial : 0);
+		tsk->cred->thread_keyring ?
+		tsk->cred->thread_keyring->serial : 0);
 
 	prkey = 0;
 	if (tsk->signal->process_keyring)
@@ -117,7 +118,7 @@ static int call_sbin_request_key(struct key_construction *cons,
 		sskey = rcu_dereference(tsk->signal->session_keyring)->serial;
 		rcu_read_unlock();
 	} else {
-		sskey = tsk->user->session_keyring->serial;
+		sskey = tsk->cred->user->session_keyring->serial;
 	}
 
 	sprintf(keyring_str[2], "%d", sskey);
@@ -232,11 +233,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 	} else {
 		/* use a default keyring; falling through the cases until we
 		 * find one that we actually have */
-		switch (tsk->jit_keyring) {
+		switch (tsk->cred->jit_keyring) {
 		case KEY_REQKEY_DEFL_DEFAULT:
 		case KEY_REQKEY_DEFL_REQUESTOR_KEYRING:
-			if (tsk->request_key_auth) {
-				authkey = tsk->request_key_auth;
+			if (tsk->cred->request_key_auth) {
+				authkey = tsk->cred->request_key_auth;
 				down_read(&authkey->sem);
 				rka = authkey->payload.data;
 				if (!test_bit(KEY_FLAG_REVOKED,
@@ -249,7 +250,7 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 			}
 
 		case KEY_REQKEY_DEFL_THREAD_KEYRING:
-			dest_keyring = key_get(tsk->thread_keyring);
+			dest_keyring = key_get(tsk->cred->thread_keyring);
 			if (dest_keyring)
 				break;
 
@@ -268,11 +269,12 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 				break;
 
 		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
-			dest_keyring = key_get(tsk->user->session_keyring);
+			dest_keyring =
+				key_get(tsk->cred->user->session_keyring);
 			break;
 
 		case KEY_REQKEY_DEFL_USER_KEYRING:
-			dest_keyring = key_get(tsk->user->uid_keyring);
+			dest_keyring = key_get(tsk->cred->user->uid_keyring);
 			break;
 
 		case KEY_REQKEY_DEFL_GROUP_KEYRING:
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 1762d44711d..2125579d5d7 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -164,22 +164,22 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 
 	/* see if the calling process is already servicing the key request of
 	 * another process */
-	if (current->request_key_auth) {
+	if (current->cred->request_key_auth) {
 		/* it is - use that instantiation context here too */
-		down_read(&current->request_key_auth->sem);
+		down_read(&current->cred->request_key_auth->sem);
 
 		/* if the auth key has been revoked, then the key we're
 		 * servicing is already instantiated */
 		if (test_bit(KEY_FLAG_REVOKED,
-			     &current->request_key_auth->flags))
+			     &current->cred->request_key_auth->flags))
 			goto auth_key_revoked;
 
-		irka = current->request_key_auth->payload.data;
+		irka = current->cred->request_key_auth->payload.data;
 		rka->context = irka->context;
 		rka->pid = irka->pid;
 		get_task_struct(rka->context);
 
-		up_read(&current->request_key_auth->sem);
+		up_read(&current->cred->request_key_auth->sem);
 	}
 	else {
 		/* it isn't - use this process as the context */
@@ -214,7 +214,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 	return authkey;
 
 auth_key_revoked:
-	up_read(&current->request_key_auth->sem);
+	up_read(&current->cred->request_key_auth->sem);
 	kfree(rka->callout_info);
 	kfree(rka);
 	kleave("= -EKEYREVOKED");
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index 64af2d3409e..cf02490cd1e 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(selinux_string_to_sid);
 int selinux_secmark_relabel_packet_permission(u32 sid)
 {
 	if (selinux_enabled) {
-		struct task_security_struct *tsec = current->security;
+		struct task_security_struct *tsec = current->cred->security;
 
 		return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET,
 				    PACKET__RELABELTO, NULL);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9f6da154cc8..328308f2882 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -167,21 +167,21 @@ static int task_alloc_security(struct task_struct *task)
 		return -ENOMEM;
 
 	tsec->osid = tsec->sid = SECINITSID_UNLABELED;
-	task->security = tsec;
+	task->cred->security = tsec;
 
 	return 0;
 }
 
 static void task_free_security(struct task_struct *task)
 {
-	struct task_security_struct *tsec = task->security;
-	task->security = NULL;
+	struct task_security_struct *tsec = task->cred->security;
+	task->cred->security = NULL;
 	kfree(tsec);
 }
 
 static int inode_alloc_security(struct inode *inode)
 {
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct inode_security_struct *isec;
 
 	isec = kmem_cache_zalloc(sel_inode_cache, GFP_NOFS);
@@ -215,7 +215,7 @@ static void inode_free_security(struct inode *inode)
 
 static int file_alloc_security(struct file *file)
 {
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct file_security_struct *fsec;
 
 	fsec = kzalloc(sizeof(struct file_security_struct), GFP_KERNEL);
@@ -554,7 +554,7 @@ static int selinux_set_mnt_opts(struct super_block *sb,
 				struct security_mnt_opts *opts)
 {
 	int rc = 0, i;
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct superblock_security_struct *sbsec = sb->s_security;
 	const char *name = sb->s_type->name;
 	struct inode *inode = sbsec->sb->s_root->d_inode;
@@ -1353,8 +1353,8 @@ static int task_has_perm(struct task_struct *tsk1,
 {
 	struct task_security_struct *tsec1, *tsec2;
 
-	tsec1 = tsk1->security;
-	tsec2 = tsk2->security;
+	tsec1 = tsk1->cred->security;
+	tsec2 = tsk2->cred->security;
 	return avc_has_perm(tsec1->sid, tsec2->sid,
 			    SECCLASS_PROCESS, perms, NULL);
 }
@@ -1374,7 +1374,7 @@ static int task_has_capability(struct task_struct *tsk,
 	u32 av = CAP_TO_MASK(cap);
 	int rc;
 
-	tsec = tsk->security;
+	tsec = tsk->cred->security;
 
 	AVC_AUDIT_DATA_INIT(&ad, CAP);
 	ad.tsk = tsk;
@@ -1405,7 +1405,7 @@ static int task_has_system(struct task_struct *tsk,
 {
 	struct task_security_struct *tsec;
 
-	tsec = tsk->security;
+	tsec = tsk->cred->security;
 
 	return avc_has_perm(tsec->sid, SECINITSID_KERNEL,
 			    SECCLASS_SYSTEM, perms, NULL);
@@ -1426,7 +1426,7 @@ static int inode_has_perm(struct task_struct *tsk,
 	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
 
-	tsec = tsk->security;
+	tsec = tsk->cred->security;
 	isec = inode->i_security;
 
 	if (!adp) {
@@ -1466,7 +1466,7 @@ static int file_has_perm(struct task_struct *tsk,
 				struct file *file,
 				u32 av)
 {
-	struct task_security_struct *tsec = tsk->security;
+	struct task_security_struct *tsec = tsk->cred->security;
 	struct file_security_struct *fsec = file->f_security;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct avc_audit_data ad;
@@ -1503,7 +1503,7 @@ static int may_create(struct inode *dir,
 	struct avc_audit_data ad;
 	int rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	dsec = dir->i_security;
 	sbsec = dir->i_sb->s_security;
 
@@ -1540,7 +1540,7 @@ static int may_create_key(u32 ksid,
 {
 	struct task_security_struct *tsec;
 
-	tsec = ctx->security;
+	tsec = ctx->cred->security;
 
 	return avc_has_perm(tsec->sid, ksid, SECCLASS_KEY, KEY__CREATE, NULL);
 }
@@ -1561,7 +1561,7 @@ static int may_link(struct inode *dir,
 	u32 av;
 	int rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	dsec = dir->i_security;
 	isec = dentry->d_inode->i_security;
 
@@ -1606,7 +1606,7 @@ static inline int may_rename(struct inode *old_dir,
 	int old_is_dir, new_is_dir;
 	int rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	old_dsec = old_dir->i_security;
 	old_isec = old_dentry->d_inode->i_security;
 	old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
@@ -1659,7 +1659,7 @@ static int superblock_has_perm(struct task_struct *tsk,
 	struct task_security_struct *tsec;
 	struct superblock_security_struct *sbsec;
 
-	tsec = tsk->security;
+	tsec = tsk->cred->security;
 	sbsec = sb->s_security;
 	return avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
 			    perms, ad);
@@ -1758,8 +1758,8 @@ static int selinux_ptrace_may_access(struct task_struct *child,
 		return rc;
 
 	if (mode == PTRACE_MODE_READ) {
-		struct task_security_struct *tsec = current->security;
-		struct task_security_struct *csec = child->security;
+		struct task_security_struct *tsec = current->cred->security;
+		struct task_security_struct *csec = child->cred->security;
 		return avc_has_perm(tsec->sid, csec->sid,
 				    SECCLASS_FILE, FILE__READ, NULL);
 	}
@@ -1874,7 +1874,7 @@ static int selinux_sysctl(ctl_table *table, int op)
 	if (rc)
 		return rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 
 	rc = selinux_sysctl_get_sid(table, (op == 0001) ?
 				    SECCLASS_DIR : SECCLASS_FILE, &tsid);
@@ -2025,7 +2025,7 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm)
 	if (bsec->set)
 		return 0;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = inode->i_security;
 
 	/* Default to the current task SID. */
@@ -2090,7 +2090,7 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm)
 
 static int selinux_bprm_secureexec(struct linux_binprm *bprm)
 {
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	int atsecure = 0;
 
 	if (tsec->osid != tsec->sid) {
@@ -2214,7 +2214,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 
 	secondary_ops->bprm_apply_creds(bprm, unsafe);
 
-	tsec = current->security;
+	tsec = current->cred->security;
 
 	bsec = bprm->security;
 	sid = bsec->sid;
@@ -2243,7 +2243,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 			rcu_read_lock();
 			tracer = tracehook_tracer_task(current);
 			if (likely(tracer != NULL)) {
-				sec = tracer->security;
+				sec = tracer->cred->security;
 				ptsid = sec->sid;
 			}
 			rcu_read_unlock();
@@ -2274,7 +2274,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
 	int rc, i;
 	unsigned long flags;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	bsec = bprm->security;
 
 	if (bsec->unsafe) {
@@ -2521,7 +2521,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
 	int rc;
 	char *namep = NULL, *context;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	dsec = dir->i_security;
 	sbsec = dir->i_sb->s_security;
 
@@ -2706,7 +2706,7 @@ static int selinux_inode_setotherxattr(struct dentry *dentry, const char *name)
 static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
 				  const void *value, size_t size, int flags)
 {
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct inode *inode = dentry->d_inode;
 	struct inode_security_struct *isec = inode->i_security;
 	struct superblock_security_struct *sbsec;
@@ -2918,7 +2918,7 @@ static int selinux_revalidate_file_permission(struct file *file, int mask)
 static int selinux_file_permission(struct file *file, int mask)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct file_security_struct *fsec = file->f_security;
 	struct inode_security_struct *isec = inode->i_security;
 
@@ -2995,7 +2995,8 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
 			     unsigned long addr, unsigned long addr_only)
 {
 	int rc = 0;
-	u32 sid = ((struct task_security_struct *)(current->security))->sid;
+	u32 sid = ((struct task_security_struct *)
+		   (current->cred->security))->sid;
 
 	if (addr < mmap_min_addr)
 		rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
@@ -3107,7 +3108,7 @@ static int selinux_file_set_fowner(struct file *file)
 	struct task_security_struct *tsec;
 	struct file_security_struct *fsec;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	fsec = file->f_security;
 	fsec->fown_sid = tsec->sid;
 
@@ -3125,7 +3126,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk,
 	/* struct fown_struct is never outside the context of a struct file */
 	file = container_of(fown, struct file, f_owner);
 
-	tsec = tsk->security;
+	tsec = tsk->cred->security;
 	fsec = file->f_security;
 
 	if (!signum)
@@ -3188,12 +3189,12 @@ static int selinux_task_alloc_security(struct task_struct *tsk)
 	struct task_security_struct *tsec1, *tsec2;
 	int rc;
 
-	tsec1 = current->security;
+	tsec1 = current->cred->security;
 
 	rc = task_alloc_security(tsk);
 	if (rc)
 		return rc;
-	tsec2 = tsk->security;
+	tsec2 = tsk->cred->security;
 
 	tsec2->osid = tsec1->osid;
 	tsec2->sid = tsec1->sid;
@@ -3251,7 +3252,7 @@ static int selinux_task_getsid(struct task_struct *p)
 
 static void selinux_task_getsecid(struct task_struct *p, u32 *secid)
 {
-	struct task_security_struct *tsec = p->security;
+	struct task_security_struct *tsec = p->cred->security;
 	*secid = tsec->sid;
 }
 
@@ -3343,7 +3344,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info,
 		perm = PROCESS__SIGNULL; /* null signal; existence test */
 	else
 		perm = signal_to_av(sig);
-	tsec = p->security;
+	tsec = p->cred->security;
 	if (secid)
 		rc = avc_has_perm(secid, tsec->sid, SECCLASS_PROCESS, perm, NULL);
 	else
@@ -3375,7 +3376,7 @@ static void selinux_task_reparent_to_init(struct task_struct *p)
 
 	secondary_ops->task_reparent_to_init(p);
 
-	tsec = p->security;
+	tsec = p->cred->security;
 	tsec->osid = tsec->sid;
 	tsec->sid = SECINITSID_KERNEL;
 	return;
@@ -3384,7 +3385,7 @@ static void selinux_task_reparent_to_init(struct task_struct *p)
 static void selinux_task_to_inode(struct task_struct *p,
 				  struct inode *inode)
 {
-	struct task_security_struct *tsec = p->security;
+	struct task_security_struct *tsec = p->cred->security;
 	struct inode_security_struct *isec = inode->i_security;
 
 	isec->sid = tsec->sid;
@@ -3632,7 +3633,7 @@ static int socket_has_perm(struct task_struct *task, struct socket *sock,
 	struct avc_audit_data ad;
 	int err = 0;
 
-	tsec = task->security;
+	tsec = task->cred->security;
 	isec = SOCK_INODE(sock)->i_security;
 
 	if (isec->sid == SECINITSID_KERNEL)
@@ -3656,7 +3657,7 @@ static int selinux_socket_create(int family, int type,
 	if (kern)
 		goto out;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	newsid = tsec->sockcreate_sid ? : tsec->sid;
 	err = avc_has_perm(tsec->sid, newsid,
 			   socket_type_to_security_class(family, type,
@@ -3677,7 +3678,7 @@ static int selinux_socket_post_create(struct socket *sock, int family,
 
 	isec = SOCK_INODE(sock)->i_security;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	newsid = tsec->sockcreate_sid ? : tsec->sid;
 	isec->sclass = socket_type_to_security_class(family, type, protocol);
 	isec->sid = kern ? SECINITSID_KERNEL : newsid;
@@ -3723,7 +3724,7 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in
 		struct sock *sk = sock->sk;
 		u32 sid, node_perm;
 
-		tsec = current->security;
+		tsec = current->cred->security;
 		isec = SOCK_INODE(sock)->i_security;
 
 		if (family == PF_INET) {
@@ -4764,7 +4765,7 @@ static int ipc_alloc_security(struct task_struct *task,
 			      struct kern_ipc_perm *perm,
 			      u16 sclass)
 {
-	struct task_security_struct *tsec = task->security;
+	struct task_security_struct *tsec = task->cred->security;
 	struct ipc_security_struct *isec;
 
 	isec = kzalloc(sizeof(struct ipc_security_struct), GFP_KERNEL);
@@ -4814,7 +4815,7 @@ static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
 	struct ipc_security_struct *isec;
 	struct avc_audit_data ad;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = ipc_perms->security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -4845,7 +4846,7 @@ static int selinux_msg_queue_alloc_security(struct msg_queue *msq)
 	if (rc)
 		return rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = msq->q_perm.security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -4871,7 +4872,7 @@ static int selinux_msg_queue_associate(struct msg_queue *msq, int msqflg)
 	struct ipc_security_struct *isec;
 	struct avc_audit_data ad;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = msq->q_perm.security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -4917,7 +4918,7 @@ static int selinux_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
 	struct avc_audit_data ad;
 	int rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = msq->q_perm.security;
 	msec = msg->security;
 
@@ -4965,7 +4966,7 @@ static int selinux_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
 	struct avc_audit_data ad;
 	int rc;
 
-	tsec = target->security;
+	tsec = target->cred->security;
 	isec = msq->q_perm.security;
 	msec = msg->security;
 
@@ -4992,7 +4993,7 @@ static int selinux_shm_alloc_security(struct shmid_kernel *shp)
 	if (rc)
 		return rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = shp->shm_perm.security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -5018,7 +5019,7 @@ static int selinux_shm_associate(struct shmid_kernel *shp, int shmflg)
 	struct ipc_security_struct *isec;
 	struct avc_audit_data ad;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = shp->shm_perm.security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -5091,7 +5092,7 @@ static int selinux_sem_alloc_security(struct sem_array *sma)
 	if (rc)
 		return rc;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = sma->sem_perm.security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -5117,7 +5118,7 @@ static int selinux_sem_associate(struct sem_array *sma, int semflg)
 	struct ipc_security_struct *isec;
 	struct avc_audit_data ad;
 
-	tsec = current->security;
+	tsec = current->cred->security;
 	isec = sma->sem_perm.security;
 
 	AVC_AUDIT_DATA_INIT(&ad, IPC);
@@ -5224,7 +5225,7 @@ static int selinux_getprocattr(struct task_struct *p,
 			return error;
 	}
 
-	tsec = p->security;
+	tsec = p->cred->security;
 
 	if (!strcmp(name, "current"))
 		sid = tsec->sid;
@@ -5308,7 +5309,7 @@ static int selinux_setprocattr(struct task_struct *p,
 	   operation.  See selinux_bprm_set_security for the execve
 	   checks and may_create for the file creation checks. The
 	   operation will then fail if the context is not permitted. */
-	tsec = p->security;
+	tsec = p->cred->security;
 	if (!strcmp(name, "exec"))
 		tsec->exec_sid = sid;
 	else if (!strcmp(name, "fscreate"))
@@ -5361,7 +5362,8 @@ boundary_ok:
 		rcu_read_lock();
 		tracer = tracehook_tracer_task(p);
 		if (tracer != NULL) {
-			struct task_security_struct *ptsec = tracer->security;
+			struct task_security_struct *ptsec =
+				tracer->cred->security;
 			u32 ptsid = ptsec->sid;
 			rcu_read_unlock();
 			error = avc_has_perm_noaudit(ptsid, sid,
@@ -5405,7 +5407,7 @@ static void selinux_release_secctx(char *secdata, u32 seclen)
 static int selinux_key_alloc(struct key *k, struct task_struct *tsk,
 			     unsigned long flags)
 {
-	struct task_security_struct *tsec = tsk->security;
+	struct task_security_struct *tsec = tsk->cred->security;
 	struct key_security_struct *ksec;
 
 	ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL);
@@ -5439,7 +5441,7 @@ static int selinux_key_permission(key_ref_t key_ref,
 
 	key = key_ref_to_ptr(key_ref);
 
-	tsec = ctx->security;
+	tsec = ctx->cred->security;
 	ksec = key->security;
 
 	/* if no specific permissions are requested, we skip the
@@ -5683,7 +5685,7 @@ static __init int selinux_init(void)
 	/* Set the security state for the initial task. */
 	if (task_alloc_security(current))
 		panic("SELinux:  Failed to initialize initial task.\n");
-	tsec = current->security;
+	tsec = current->cred->security;
 	tsec->osid = tsec->sid = SECINITSID_KERNEL;
 
 	sel_inode_cache = kmem_cache_create("selinux_inode_security",
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 69c9dccc8cf..10715d1330b 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -97,7 +97,7 @@ static int task_has_security(struct task_struct *tsk,
 {
 	struct task_security_struct *tsec;
 
-	tsec = tsk->security;
+	tsec = tsk->cred->security;
 	if (!tsec)
 		return -EACCES;
 
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index 8f17f542a11..d7db76617b0 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -197,7 +197,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp,
 	struct xfrm_user_sec_ctx *uctx, u32 sid)
 {
 	int rc = 0;
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct xfrm_sec_ctx *ctx = NULL;
 	char *ctx_str = NULL;
 	u32 str_len;
@@ -333,7 +333,7 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
  */
 int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
 {
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	int rc = 0;
 
 	if (ctx) {
@@ -378,7 +378,7 @@ void selinux_xfrm_state_free(struct xfrm_state *x)
   */
 int selinux_xfrm_state_delete(struct xfrm_state *x)
 {
-	struct task_security_struct *tsec = current->security;
+	struct task_security_struct *tsec = current->cred->security;
 	struct xfrm_sec_ctx *ctx = x->security;
 	int rc = 0;
 
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index 79ff21ed4c3..b6dd4fc0fb0 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -164,7 +164,7 @@ int smk_curacc(char *obj_label, u32 mode)
 {
 	int rc;
 
-	rc = smk_access(current->security, obj_label, mode);
+	rc = smk_access(current->cred->security, obj_label, mode);
 	if (rc == 0)
 		return 0;
 
@@ -173,7 +173,7 @@ int smk_curacc(char *obj_label, u32 mode)
 	 * only one that gets privilege and current does not
 	 * have that label.
 	 */
-	if (smack_onlycap != NULL && smack_onlycap != current->security)
+	if (smack_onlycap != NULL && smack_onlycap != current->cred->security)
 		return rc;
 
 	if (capable(CAP_MAC_OVERRIDE))
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 6e2dc0bab70..791da238d04 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -102,7 +102,8 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(current->security, ctp->security, MAY_READWRITE);
+	rc = smk_access(current->cred->security, ctp->cred->security,
+			MAY_READWRITE);
 	if (rc != 0 && capable(CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -124,7 +125,8 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(ptp->security, current->security, MAY_READWRITE);
+	rc = smk_access(ptp->cred->security, current->cred->security,
+			MAY_READWRITE);
 	if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -141,7 +143,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
 static int smack_syslog(int type)
 {
 	int rc;
-	char *sp = current->security;
+	char *sp = current->cred->security;
 
 	rc = cap_syslog(type);
 	if (rc != 0)
@@ -373,7 +375,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags)
  */
 static int smack_inode_alloc_security(struct inode *inode)
 {
-	inode->i_security = new_inode_smack(current->security);
+	inode->i_security = new_inode_smack(current->cred->security);
 	if (inode->i_security == NULL)
 		return -ENOMEM;
 	return 0;
@@ -818,7 +820,7 @@ static int smack_file_permission(struct file *file, int mask)
  */
 static int smack_file_alloc_security(struct file *file)
 {
-	file->f_security = current->security;
+	file->f_security = current->cred->security;
 	return 0;
 }
 
@@ -916,7 +918,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
  */
 static int smack_file_set_fowner(struct file *file)
 {
-	file->f_security = current->security;
+	file->f_security = current->cred->security;
 	return 0;
 }
 
@@ -941,7 +943,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 	 * struct fown_struct is never outside the context of a struct file
 	 */
 	file = container_of(fown, struct file, f_owner);
-	rc = smk_access(file->f_security, tsk->security, MAY_WRITE);
+	rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE);
 	if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -984,7 +986,7 @@ static int smack_file_receive(struct file *file)
  */
 static int smack_task_alloc_security(struct task_struct *tsk)
 {
-	tsk->security = current->security;
+	tsk->cred->security = current->cred->security;
 
 	return 0;
 }
@@ -999,7 +1001,7 @@ static int smack_task_alloc_security(struct task_struct *tsk)
  */
 static void smack_task_free_security(struct task_struct *task)
 {
-	task->security = NULL;
+	task->cred->security = NULL;
 }
 
 /**
@@ -1011,7 +1013,7 @@ static void smack_task_free_security(struct task_struct *task)
  */
 static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
 {
-	return smk_curacc(p->security, MAY_WRITE);
+	return smk_curacc(p->cred->security, MAY_WRITE);
 }
 
 /**
@@ -1022,7 +1024,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
  */
 static int smack_task_getpgid(struct task_struct *p)
 {
-	return smk_curacc(p->security, MAY_READ);
+	return smk_curacc(p->cred->security, MAY_READ);
 }
 
 /**
@@ -1033,7 +1035,7 @@ static int smack_task_getpgid(struct task_struct *p)
  */
 static int smack_task_getsid(struct task_struct *p)
 {
-	return smk_curacc(p->security, MAY_READ);
+	return smk_curacc(p->cred->security, MAY_READ);
 }
 
 /**
@@ -1045,7 +1047,7 @@ static int smack_task_getsid(struct task_struct *p)
  */
 static void smack_task_getsecid(struct task_struct *p, u32 *secid)
 {
-	*secid = smack_to_secid(p->security);
+	*secid = smack_to_secid(p->cred->security);
 }
 
 /**
@@ -1061,7 +1063,7 @@ static int smack_task_setnice(struct task_struct *p, int nice)
 
 	rc = cap_task_setnice(p, nice);
 	if (rc == 0)
-		rc = smk_curacc(p->security, MAY_WRITE);
+		rc = smk_curacc(p->cred->security, MAY_WRITE);
 	return rc;
 }
 
@@ -1078,7 +1080,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
 
 	rc = cap_task_setioprio(p, ioprio);
 	if (rc == 0)
-		rc = smk_curacc(p->security, MAY_WRITE);
+		rc = smk_curacc(p->cred->security, MAY_WRITE);
 	return rc;
 }
 
@@ -1090,7 +1092,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
  */
 static int smack_task_getioprio(struct task_struct *p)
 {
-	return smk_curacc(p->security, MAY_READ);
+	return smk_curacc(p->cred->security, MAY_READ);
 }
 
 /**
@@ -1108,7 +1110,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
 
 	rc = cap_task_setscheduler(p, policy, lp);
 	if (rc == 0)
-		rc = smk_curacc(p->security, MAY_WRITE);
+		rc = smk_curacc(p->cred->security, MAY_WRITE);
 	return rc;
 }
 
@@ -1120,7 +1122,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
  */
 static int smack_task_getscheduler(struct task_struct *p)
 {
-	return smk_curacc(p->security, MAY_READ);
+	return smk_curacc(p->cred->security, MAY_READ);
 }
 
 /**
@@ -1131,7 +1133,7 @@ static int smack_task_getscheduler(struct task_struct *p)
  */
 static int smack_task_movememory(struct task_struct *p)
 {
-	return smk_curacc(p->security, MAY_WRITE);
+	return smk_curacc(p->cred->security, MAY_WRITE);
 }
 
 /**
@@ -1154,13 +1156,13 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 	 * can write the receiver.
 	 */
 	if (secid == 0)
-		return smk_curacc(p->security, MAY_WRITE);
+		return smk_curacc(p->cred->security, MAY_WRITE);
 	/*
 	 * If the secid isn't 0 we're dealing with some USB IO
 	 * specific behavior. This is not clean. For one thing
 	 * we can't take privilege into account.
 	 */
-	return smk_access(smack_from_secid(secid), p->security, MAY_WRITE);
+	return smk_access(smack_from_secid(secid), p->cred->security, MAY_WRITE);
 }
 
 /**
@@ -1173,7 +1175,7 @@ static int smack_task_wait(struct task_struct *p)
 {
 	int rc;
 
-	rc = smk_access(current->security, p->security, MAY_WRITE);
+	rc = smk_access(current->cred->security, p->cred->security, MAY_WRITE);
 	if (rc == 0)
 		return 0;
 
@@ -1204,7 +1206,7 @@ static int smack_task_wait(struct task_struct *p)
 static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
 {
 	struct inode_smack *isp = inode->i_security;
-	isp->smk_inode = p->security;
+	isp->smk_inode = p->cred->security;
 }
 
 /*
@@ -1223,7 +1225,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
  */
 static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags)
 {
-	char *csp = current->security;
+	char *csp = current->cred->security;
 	struct socket_smack *ssp;
 
 	ssp = kzalloc(sizeof(struct socket_smack), gfp_flags);
@@ -1448,7 +1450,7 @@ static int smack_flags_to_may(int flags)
  */
 static int smack_msg_msg_alloc_security(struct msg_msg *msg)
 {
-	msg->security = current->security;
+	msg->security = current->cred->security;
 	return 0;
 }
 
@@ -1484,7 +1486,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp)
 {
 	struct kern_ipc_perm *isp = &shp->shm_perm;
 
-	isp->security = current->security;
+	isp->security = current->cred->security;
 	return 0;
 }
 
@@ -1593,7 +1595,7 @@ static int smack_sem_alloc_security(struct sem_array *sma)
 {
 	struct kern_ipc_perm *isp = &sma->sem_perm;
 
-	isp->security = current->security;
+	isp->security = current->cred->security;
 	return 0;
 }
 
@@ -1697,7 +1699,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq)
 {
 	struct kern_ipc_perm *kisp = &msq->q_perm;
 
-	kisp->security = current->security;
+	kisp->security = current->cred->security;
 	return 0;
 }
 
@@ -1852,7 +1854,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 	struct super_block *sbp;
 	struct superblock_smack *sbsp;
 	struct inode_smack *isp;
-	char *csp = current->security;
+	char *csp = current->cred->security;
 	char *fetched;
 	char *final;
 	struct dentry *dp;
@@ -2009,7 +2011,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
 	if (strcmp(name, "current") != 0)
 		return -EINVAL;
 
-	cp = kstrdup(p->security, GFP_KERNEL);
+	cp = kstrdup(p->cred->security, GFP_KERNEL);
 	if (cp == NULL)
 		return -ENOMEM;
 
@@ -2055,7 +2057,7 @@ static int smack_setprocattr(struct task_struct *p, char *name,
 	if (newsmack == NULL)
 		return -EINVAL;
 
-	p->security = newsmack;
+	p->cred->security = newsmack;
 	return size;
 }
 
@@ -2288,8 +2290,8 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent)
 		return;
 
 	ssp = sk->sk_security;
-	ssp->smk_in = current->security;
-	ssp->smk_out = current->security;
+	ssp->smk_in = current->cred->security;
+	ssp->smk_out = current->cred->security;
 	ssp->smk_packet[0] = '\0';
 
 	rc = smack_netlabel(sk);
@@ -2362,7 +2364,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 static int smack_key_alloc(struct key *key, struct task_struct *tsk,
 			   unsigned long flags)
 {
-	key->security = tsk->security;
+	key->security = tsk->cred->security;
 	return 0;
 }
 
@@ -2403,10 +2405,11 @@ static int smack_key_permission(key_ref_t key_ref,
 	/*
 	 * This should not occur
 	 */
-	if (context->security == NULL)
+	if (context->cred->security == NULL)
 		return -EACCES;
 
-	return smk_access(context->security, keyp->security, MAY_READWRITE);
+	return smk_access(context->cred->security, keyp->security,
+			  MAY_READWRITE);
 }
 #endif /* CONFIG_KEYS */
 
@@ -2726,7 +2729,7 @@ static __init int smack_init(void)
 	/*
 	 * Set the security state for the initial task.
 	 */
-	current->security = &smack_known_floor.smk_known;
+	current->cred->security = &smack_known_floor.smk_known;
 
 	/*
 	 * Initialize locks
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index c21d8c8bf0c..c5ca279e050 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -336,7 +336,7 @@ static void smk_cipso_doi(void)
 
 	audit_info.loginuid = audit_get_loginuid(current);
 	audit_info.sessionid = audit_get_sessionid(current);
-	audit_info.secid = smack_to_secid(current->security);
+	audit_info.secid = smack_to_secid(current->cred->security);
 
 	rc = netlbl_cfg_map_del(NULL, &audit_info);
 	if (rc != 0)
@@ -371,7 +371,7 @@ static void smk_unlbl_ambient(char *oldambient)
 
 	audit_info.loginuid = audit_get_loginuid(current);
 	audit_info.sessionid = audit_get_sessionid(current);
-	audit_info.secid = smack_to_secid(current->security);
+	audit_info.secid = smack_to_secid(current->cred->security);
 
 	if (oldambient != NULL) {
 		rc = netlbl_cfg_map_del(oldambient, &audit_info);
@@ -843,7 +843,7 @@ static ssize_t smk_write_onlycap(struct file *file, const char __user *buf,
 				 size_t count, loff_t *ppos)
 {
 	char in[SMK_LABELLEN];
-	char *sp = current->security;
+	char *sp = current->cred->security;
 
 	if (!capable(CAP_MAC_ADMIN))
 		return -EPERM;
-- 
cgit v1.2.3-70-g09d2


From f1752eec6145c97163dbce62d17cf5d928e28a27 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:17 +1100
Subject: CRED: Detach the credentials from task_struct

Detach the credentials from task_struct, duplicating them in copy_process()
and releasing them in __put_task_struct().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/cred.h       | 29 ++++++++++++++
 include/linux/init_task.h  | 16 +-------
 include/linux/sched.h      |  1 -
 include/linux/security.h   | 26 ++++++-------
 kernel/Makefile            |  2 +-
 kernel/cred.c              | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c              | 24 +++---------
 security/capability.c      |  8 ++--
 security/security.c        |  8 ++--
 security/selinux/hooks.c   | 32 ++++++++--------
 security/smack/smack_lsm.c | 20 +++++-----
 11 files changed, 179 insertions(+), 83 deletions(-)
 create mode 100644 kernel/cred.c

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 3e65587a72e..a7a686074cb 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -158,4 +158,33 @@ do {						\
 	*(_gid) = current->cred->fsgid;		\
 } while(0)
 
+extern void __put_cred(struct cred *);
+extern int copy_creds(struct task_struct *, unsigned long);
+
+/**
+ * get_cred - Get a reference on a set of credentials
+ * @cred: The credentials to reference
+ *
+ * Get a reference on the specified set of credentials.  The caller must
+ * release the reference.
+ */
+static inline struct cred *get_cred(struct cred *cred)
+{
+	atomic_inc(&cred->usage);
+	return cred;
+}
+
+/**
+ * put_cred - Release a reference to a set of credentials
+ * @cred: The credentials to release
+ *
+ * Release a reference to a set of credentials, deleting them when the last ref
+ * is released.
+ */
+static inline void put_cred(struct cred *cred)
+{
+	if (atomic_dec_and_test(&(cred)->usage))
+		__put_cred(cred);
+}
+
 #endif /* _LINUX_CRED_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9de41ccd67b..5e24c54b6df 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -115,19 +115,6 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
-#define INIT_CRED(p)						\
-{								\
-	.usage			= ATOMIC_INIT(3),		\
-	.securebits		= SECUREBITS_DEFAULT,		\
-	.cap_inheritable	= CAP_INIT_INH_SET,		\
-	.cap_permitted		= CAP_FULL_SET,			\
-	.cap_effective		= CAP_INIT_EFF_SET,		\
-	.cap_bset		= CAP_INIT_BSET,		\
-	.user			= INIT_USER,			\
-	.group_info		= &init_groups,			\
-	.lock			= __SPIN_LOCK_UNLOCKED(p.lock),	\
-}
-
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -162,8 +149,7 @@ extern struct cred init_cred;
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
-	.__temp_cred	= INIT_CRED(tsk.__temp_cred),			\
-	.cred		= &tsk.__temp_cred,				\
+	.cred		= &init_cred,					\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c8b92502354..740cf946c8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,6 @@ struct task_struct {
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	struct cred __temp_cred __deprecated; /* temporary credentials to be removed */
 	struct cred *cred;	/* actual/objective task credentials */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
diff --git a/include/linux/security.h b/include/linux/security.h
index 9f305d4a31a..9239cc11eb9 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -593,15 +593,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	manual page for definitions of the @clone_flags.
  *	@clone_flags contains the flags indicating what should be shared.
  *	Return 0 if permission is granted.
- * @task_alloc_security:
- *	@p contains the task_struct for child process.
- *	Allocate and attach a security structure to the p->security field. The
- *	security field is initialized to NULL when the task structure is
+ * @cred_alloc_security:
+ *	@cred contains the cred struct for child process.
+ *	Allocate and attach a security structure to the cred->security field.
+ *	The security field is initialized to NULL when the task structure is
  *	allocated.
  *	Return 0 if operation was successful.
- * @task_free_security:
- *	@p contains the task_struct for process.
- *	Deallocate and clear the p->security field.
+ * @cred_free:
+ *	@cred points to the credentials.
+ *	Deallocate and clear the cred->security field in a set of credentials.
  * @task_setuid:
  *	Check permission before setting one or more of the user identity
  *	attributes of the current process.  The @flags parameter indicates
@@ -1405,8 +1405,8 @@ struct security_operations {
 	int (*dentry_open) (struct file *file);
 
 	int (*task_create) (unsigned long clone_flags);
-	int (*task_alloc_security) (struct task_struct *p);
-	void (*task_free_security) (struct task_struct *p);
+	int (*cred_alloc_security) (struct cred *cred);
+	void (*cred_free) (struct cred *cred);
 	int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags);
 	int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ ,
 				 uid_t old_euid, uid_t old_suid, int flags);
@@ -1660,8 +1660,8 @@ int security_file_send_sigiotask(struct task_struct *tsk,
 int security_file_receive(struct file *file);
 int security_dentry_open(struct file *file);
 int security_task_create(unsigned long clone_flags);
-int security_task_alloc(struct task_struct *p);
-void security_task_free(struct task_struct *p);
+int security_cred_alloc(struct cred *cred);
+void security_cred_free(struct cred *cred);
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags);
 int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
 			      uid_t old_suid, int flags);
@@ -2181,12 +2181,12 @@ static inline int security_task_create(unsigned long clone_flags)
 	return 0;
 }
 
-static inline int security_task_alloc(struct task_struct *p)
+static inline int security_cred_alloc(struct cred *cred)
 {
 	return 0;
 }
 
-static inline void security_task_free(struct task_struct *p)
+static inline void security_cred_free(struct cred *cred)
 { }
 
 static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2,
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d8..5a6a612c302 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
 
 CFLAGS_REMOVE_sched.o = -mno-spe
 
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 00000000000..833244a7cb0
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,96 @@
+/* Task credentials management
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <linux/sched.h>
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <linux/init_task.h>
+#include <linux/security.h>
+
+/*
+ * The initial credentials for the initial task
+ */
+struct cred init_cred = {
+	.usage			= ATOMIC_INIT(3),
+	.securebits		= SECUREBITS_DEFAULT,
+	.cap_inheritable	= CAP_INIT_INH_SET,
+	.cap_permitted		= CAP_FULL_SET,
+	.cap_effective		= CAP_INIT_EFF_SET,
+	.cap_bset		= CAP_INIT_BSET,
+	.user			= INIT_USER,
+	.group_info		= &init_groups,
+};
+
+/*
+ * The RCU callback to actually dispose of a set of credentials
+ */
+static void put_cred_rcu(struct rcu_head *rcu)
+{
+	struct cred *cred = container_of(rcu, struct cred, rcu);
+
+	BUG_ON(atomic_read(&cred->usage) != 0);
+
+	key_put(cred->thread_keyring);
+	key_put(cred->request_key_auth);
+	put_group_info(cred->group_info);
+	free_uid(cred->user);
+	security_cred_free(cred);
+	kfree(cred);
+}
+
+/**
+ * __put_cred - Destroy a set of credentials
+ * @sec: The record to release
+ *
+ * Destroy a set of credentials on which no references remain.
+ */
+void __put_cred(struct cred *cred)
+{
+	call_rcu(&cred->rcu, put_cred_rcu);
+}
+EXPORT_SYMBOL(__put_cred);
+
+/*
+ * Copy credentials for the new process created by fork()
+ */
+int copy_creds(struct task_struct *p, unsigned long clone_flags)
+{
+	struct cred *pcred;
+	int ret;
+
+	pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL);
+	if (!pcred)
+		return -ENOMEM;
+
+#ifdef CONFIG_SECURITY
+	pcred->security = NULL;
+#endif
+
+	ret = security_cred_alloc(pcred);
+	if (ret < 0) {
+		kfree(pcred);
+		return ret;
+	}
+
+	atomic_set(&pcred->usage, 1);
+	get_group_info(pcred->group_info);
+	get_uid(pcred->user);
+	key_get(pcred->thread_keyring);
+	key_get(pcred->request_key_auth);
+
+	atomic_inc(&pcred->user->processes);
+
+	/* RCU assignment is unneeded here as no-one can have accessed this
+	 * pointer yet, barring us */
+	p->cred = pcred;
+	return 0;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 81fdc773390..c932e283ddf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,9 +146,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	security_task_free(tsk);
-	free_uid(tsk->__temp_cred.user);
-	put_group_info(tsk->__temp_cred.group_info);
+	put_cred(tsk->cred);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -969,7 +967,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
-	p->cred = &p->__temp_cred;
 	retval = -EAGAIN;
 	if (atomic_read(&p->cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -978,9 +975,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			goto bad_fork_free;
 	}
 
-	atomic_inc(&p->cred->user->__count);
-	atomic_inc(&p->cred->user->processes);
-	get_group_info(p->cred->group_info);
+	retval = copy_creds(p, clone_flags);
+	if (retval < 0)
+		goto bad_fork_free;
 
 	/*
 	 * If multiple threads are within copy_process(), then this check
@@ -1035,9 +1032,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
-#ifdef CONFIG_SECURITY
-	p->cred->security = NULL;
-#endif
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
@@ -1082,10 +1076,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
-	if ((retval = security_task_alloc(p)))
-		goto bad_fork_cleanup_policy;
 	if ((retval = audit_alloc(p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_policy;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
 		goto bad_fork_cleanup_audit;
@@ -1284,8 +1276,6 @@ bad_fork_cleanup_semundo:
 	exit_sem(p);
 bad_fork_cleanup_audit:
 	audit_free(p);
-bad_fork_cleanup_security:
-	security_task_free(p);
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
@@ -1298,9 +1288,7 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
-	put_group_info(p->cred->group_info);
-	atomic_dec(&p->cred->user->processes);
-	free_uid(p->cred->user);
+	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff --git a/security/capability.c b/security/capability.c
index 24587481903..6c4b5137ca7 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -340,12 +340,12 @@ static int cap_task_create(unsigned long clone_flags)
 	return 0;
 }
 
-static int cap_task_alloc_security(struct task_struct *p)
+static int cap_cred_alloc_security(struct cred *cred)
 {
 	return 0;
 }
 
-static void cap_task_free_security(struct task_struct *p)
+static void cap_cred_free(struct cred *cred)
 {
 }
 
@@ -890,8 +890,8 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, file_receive);
 	set_to_cap_if_null(ops, dentry_open);
 	set_to_cap_if_null(ops, task_create);
-	set_to_cap_if_null(ops, task_alloc_security);
-	set_to_cap_if_null(ops, task_free_security);
+	set_to_cap_if_null(ops, cred_alloc_security);
+	set_to_cap_if_null(ops, cred_free);
 	set_to_cap_if_null(ops, task_setuid);
 	set_to_cap_if_null(ops, task_post_setuid);
 	set_to_cap_if_null(ops, task_setgid);
diff --git a/security/security.c b/security/security.c
index 81c956a1230..d058f7d5b10 100644
--- a/security/security.c
+++ b/security/security.c
@@ -616,14 +616,14 @@ int security_task_create(unsigned long clone_flags)
 	return security_ops->task_create(clone_flags);
 }
 
-int security_task_alloc(struct task_struct *p)
+int security_cred_alloc(struct cred *cred)
 {
-	return security_ops->task_alloc_security(p);
+	return security_ops->cred_alloc_security(cred);
 }
 
-void security_task_free(struct task_struct *p)
+void security_cred_free(struct cred *cred)
 {
-	security_ops->task_free_security(p);
+	security_ops->cred_free(cred);
 }
 
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 328308f2882..658435dce37 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -158,7 +158,7 @@ static int selinux_secmark_enabled(void)
 
 /* Allocate and free functions for each kind of security blob. */
 
-static int task_alloc_security(struct task_struct *task)
+static int cred_alloc_security(struct cred *cred)
 {
 	struct task_security_struct *tsec;
 
@@ -167,18 +167,11 @@ static int task_alloc_security(struct task_struct *task)
 		return -ENOMEM;
 
 	tsec->osid = tsec->sid = SECINITSID_UNLABELED;
-	task->cred->security = tsec;
+	cred->security = tsec;
 
 	return 0;
 }
 
-static void task_free_security(struct task_struct *task)
-{
-	struct task_security_struct *tsec = task->cred->security;
-	task->cred->security = NULL;
-	kfree(tsec);
-}
-
 static int inode_alloc_security(struct inode *inode)
 {
 	struct task_security_struct *tsec = current->cred->security;
@@ -3184,17 +3177,17 @@ static int selinux_task_create(unsigned long clone_flags)
 	return task_has_perm(current, current, PROCESS__FORK);
 }
 
-static int selinux_task_alloc_security(struct task_struct *tsk)
+static int selinux_cred_alloc_security(struct cred *cred)
 {
 	struct task_security_struct *tsec1, *tsec2;
 	int rc;
 
 	tsec1 = current->cred->security;
 
-	rc = task_alloc_security(tsk);
+	rc = cred_alloc_security(cred);
 	if (rc)
 		return rc;
-	tsec2 = tsk->cred->security;
+	tsec2 = cred->security;
 
 	tsec2->osid = tsec1->osid;
 	tsec2->sid = tsec1->sid;
@@ -3208,9 +3201,14 @@ static int selinux_task_alloc_security(struct task_struct *tsk)
 	return 0;
 }
 
-static void selinux_task_free_security(struct task_struct *tsk)
+/*
+ * detach and free the LSM part of a set of credentials
+ */
+static void selinux_cred_free(struct cred *cred)
 {
-	task_free_security(tsk);
+	struct task_security_struct *tsec = cred->security;
+	cred->security = NULL;
+	kfree(tsec);
 }
 
 static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
@@ -5552,8 +5550,8 @@ static struct security_operations selinux_ops = {
 	.dentry_open =			selinux_dentry_open,
 
 	.task_create =			selinux_task_create,
-	.task_alloc_security =		selinux_task_alloc_security,
-	.task_free_security =		selinux_task_free_security,
+	.cred_alloc_security =		selinux_cred_alloc_security,
+	.cred_free =			selinux_cred_free,
 	.task_setuid =			selinux_task_setuid,
 	.task_post_setuid =		selinux_task_post_setuid,
 	.task_setgid =			selinux_task_setgid,
@@ -5683,7 +5681,7 @@ static __init int selinux_init(void)
 	printk(KERN_INFO "SELinux:  Initializing.\n");
 
 	/* Set the security state for the initial task. */
-	if (task_alloc_security(current))
+	if (cred_alloc_security(current->cred))
 		panic("SELinux:  Failed to initialize initial task.\n");
 	tsec = current->cred->security;
 	tsec->osid = tsec->sid = SECINITSID_KERNEL;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 791da238d04..cc837314fb0 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -975,8 +975,8 @@ static int smack_file_receive(struct file *file)
  */
 
 /**
- * smack_task_alloc_security - "allocate" a task blob
- * @tsk: the task in need of a blob
+ * smack_cred_alloc_security - "allocate" a task cred blob
+ * @cred: the task creds in need of a blob
  *
  * Smack isn't using copies of blobs. Everyone
  * points to an immutable list. No alloc required.
@@ -984,24 +984,24 @@ static int smack_file_receive(struct file *file)
  *
  * Always returns 0
  */
-static int smack_task_alloc_security(struct task_struct *tsk)
+static int smack_cred_alloc_security(struct cred *cred)
 {
-	tsk->cred->security = current->cred->security;
+	cred->security = current->cred->security;
 
 	return 0;
 }
 
 /**
- * smack_task_free_security - "free" a task blob
- * @task: the task with the blob
+ * smack_cred_free - "free" task-level security credentials
+ * @cred: the credentials in question
  *
  * Smack isn't using copies of blobs. Everyone
  * points to an immutable list. The blobs never go away.
  * There is no leak here.
  */
-static void smack_task_free_security(struct task_struct *task)
+static void smack_cred_free(struct cred *cred)
 {
-	task->cred->security = NULL;
+	cred->security = NULL;
 }
 
 /**
@@ -2630,8 +2630,8 @@ struct security_operations smack_ops = {
 	.file_send_sigiotask = 		smack_file_send_sigiotask,
 	.file_receive = 		smack_file_receive,
 
-	.task_alloc_security = 		smack_task_alloc_security,
-	.task_free_security = 		smack_task_free_security,
+	.cred_alloc_security = 		smack_cred_alloc_security,
+	.cred_free =			smack_cred_free,
 	.task_post_setuid =		cap_task_post_setuid,
 	.task_setpgid = 		smack_task_setpgid,
 	.task_getpgid = 		smack_task_getpgid,
-- 
cgit v1.2.3-70-g09d2


From 86a264abe542cfececb4df129bc45a0338d8cdb9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:18 +1100
Subject: CRED: Wrap current->cred and a few other accessors

Wrap current->cred and a few other accessors to hide their actual
implementation.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 arch/ia64/ia32/sys_ia32.c     |   7 +-
 drivers/net/tun.c             |   8 +-
 drivers/usb/core/devio.c      |  10 ++-
 fs/binfmt_elf.c               |  10 +--
 fs/binfmt_elf_fdpic.c         |   9 +-
 fs/exec.c                     |   5 +-
 fs/fcntl.c                    |   3 +-
 fs/file_table.c               |   7 +-
 fs/hugetlbfs/inode.c          |   5 +-
 fs/ioprio.c                   |   4 +-
 fs/smbfs/dir.c                |   3 +-
 include/linux/cred.h          | 187 ++++++++++++++++++++++++++++++++----------
 include/linux/securebits.h    |   2 +-
 ipc/mqueue.c                  |   2 +-
 ipc/shm.c                     |   4 +-
 kernel/sys.c                  |  59 +++++++------
 kernel/uid16.c                |  31 +++----
 net/core/scm.c                |   2 +-
 net/sunrpc/auth.c             |  14 ++--
 security/commoncap.c          |   2 +-
 security/keys/process_keys.c  |   2 +-
 security/keys/request_key.c   |  11 +--
 security/selinux/exports.c    |   8 +-
 security/selinux/xfrm.c       |   6 +-
 security/smack/smack_access.c |   2 +-
 security/smack/smack_lsm.c    |  26 +++---
 security/smack/smackfs.c      |   4 +-
 27 files changed, 271 insertions(+), 162 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 2445a9d3488..16ef61a91d9 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1767,25 +1767,24 @@ groups16_from_user(struct group_info *group_info, short __user *grouplist)
 asmlinkage long
 sys32_getgroups16 (int gidsetsize, short __user *grouplist)
 {
+	const struct cred *cred = current_cred();
 	int i;
 
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->cred->group_info);
-	i = current->cred->group_info->ngroups;
+	i = cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->cred->group_info)) {
+		if (groups16_to_user(grouplist, cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->cred->group_info);
 	return i;
 }
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b14e2025e22..55dc70c6b4d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -702,6 +702,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	struct tun_net *tn;
 	struct tun_struct *tun;
 	struct net_device *dev;
+	const struct cred *cred = current_cred();
 	int err;
 
 	tn = net_generic(net, tun_net_id);
@@ -712,11 +713,12 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		/* Check permissions */
 		if (((tun->owner != -1 &&
-		      current_euid() != tun->owner) ||
+		      cred->euid != tun->owner) ||
 		     (tun->group != -1 &&
-		      current_egid() != tun->group)) &&
-		     !capable(CAP_NET_ADMIN))
+		      cred->egid != tun->group)) &&
+		    !capable(CAP_NET_ADMIN)) {
 			return -EPERM;
+		}
 	}
 	else if (__dev_get_by_name(net, ifr->ifr_name))
 		return -EINVAL;
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 1aadb938702..aa79280df15 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -574,6 +574,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 {
 	struct usb_device *dev = NULL;
 	struct dev_state *ps;
+	const struct cred *cred = current_cred();
 	int ret;
 
 	lock_kernel();
@@ -617,8 +618,8 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	init_waitqueue_head(&ps->wait);
 	ps->discsignr = 0;
 	ps->disc_pid = get_pid(task_pid(current));
-	ps->disc_uid = current_uid();
-	ps->disc_euid = current_euid();
+	ps->disc_uid = cred->uid;
+	ps->disc_euid = cred->euid;
 	ps->disccontext = NULL;
 	ps->ifclaimed = 0;
 	security_task_getsecid(current, &ps->secid);
@@ -967,6 +968,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	struct usb_host_endpoint *ep;
 	struct async *as;
 	struct usb_ctrlrequest *dr = NULL;
+	const struct cred *cred = current_cred();
 	unsigned int u, totlen, isofrmlen;
 	int ret, ifnum = -1;
 	int is_in;
@@ -1174,8 +1176,8 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 	as->signr = uurb->signr;
 	as->ifnum = ifnum;
 	as->pid = get_pid(task_pid(current));
-	as->uid = current_uid();
-	as->euid = current_euid();
+	as->uid = cred->uid;
+	as->euid = cred->euid;
 	security_task_getsecid(current, &as->secid);
 	if (!is_in) {
 		if (copy_from_user(as->urb->transfer_buffer, uurb->buffer,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7a52477ce49..0e665561316 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -157,7 +157,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	int items;
 	elf_addr_t *elf_info;
 	int ei_index = 0;
-	struct task_struct *tsk = current;
+	const struct cred *cred = current_cred();
 	struct vm_area_struct *vma;
 
 	/*
@@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
 	NEW_AUX_ENT(AT_FLAGS, 0);
 	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-	NEW_AUX_ENT(AT_UID, tsk->cred->uid);
-	NEW_AUX_ENT(AT_EUID, tsk->cred->euid);
-	NEW_AUX_ENT(AT_GID, tsk->cred->gid);
-	NEW_AUX_ENT(AT_EGID, tsk->cred->egid);
+	NEW_AUX_ENT(AT_UID, cred->uid);
+	NEW_AUX_ENT(AT_EUID, cred->euid);
+	NEW_AUX_ENT(AT_GID, cred->gid);
+	NEW_AUX_ENT(AT_EGID, cred->egid);
  	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9f67054c2c4..1f6e8c023b4 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -475,6 +475,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 				   struct elf_fdpic_params *exec_params,
 				   struct elf_fdpic_params *interp_params)
 {
+	const struct cred *cred = current_cred();
 	unsigned long sp, csp, nitems;
 	elf_caddr_t __user *argv, *envp;
 	size_t platform_len = 0, len;
@@ -623,10 +624,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	NEW_AUX_ENT(AT_BASE,	interp_params->elfhdr_addr);
 	NEW_AUX_ENT(AT_FLAGS,	0);
 	NEW_AUX_ENT(AT_ENTRY,	exec_params->entry_addr);
-	NEW_AUX_ENT(AT_UID,	(elf_addr_t) current->cred->uid);
-	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) current->cred->euid);
-	NEW_AUX_ENT(AT_GID,	(elf_addr_t) current->cred->gid);
-	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) current->cred->egid);
+	NEW_AUX_ENT(AT_UID,	(elf_addr_t) cred->uid);
+	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) cred->euid);
+	NEW_AUX_ENT(AT_GID,	(elf_addr_t) cred->gid);
+	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) cred->egid);
 	NEW_AUX_ENT(AT_SECURE,	security_bprm_secureexec(bprm));
 	NEW_AUX_ENT(AT_EXECFN,	bprm->exec);
 
diff --git a/fs/exec.c b/fs/exec.c
index 31149e430a8..a5330e1a221 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1388,6 +1388,7 @@ EXPORT_SYMBOL(set_binfmt);
  */
 static int format_corename(char *corename, long signr)
 {
+	const struct cred *cred = current_cred();
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
 	char *out_ptr = corename;
@@ -1424,7 +1425,7 @@ static int format_corename(char *corename, long signr)
 			/* uid */
 			case 'u':
 				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", current_uid());
+					      "%d", cred->uid);
 				if (rc > out_end - out_ptr)
 					goto out;
 				out_ptr += rc;
@@ -1432,7 +1433,7 @@ static int format_corename(char *corename, long signr)
 			/* gid */
 			case 'g':
 				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%d", current_gid());
+					      "%d", cred->gid);
 				if (rc > out_end - out_ptr)
 					goto out;
 				out_ptr += rc;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 63964d863ad..c594cc0e40f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -205,13 +205,14 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
 int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
 		int force)
 {
+	const struct cred *cred = current_cred();
 	int err;
 	
 	err = security_file_set_fowner(filp);
 	if (err)
 		return err;
 
-	f_modown(filp, pid, type, current_uid(), current_euid(), force);
+	f_modown(filp, pid, type, cred->uid, cred->euid, force);
 	return 0;
 }
 EXPORT_SYMBOL(__f_setown);
diff --git a/fs/file_table.c b/fs/file_table.c
index 3152b53cfab..bc4563fe791 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -94,7 +94,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *filp,
  */
 struct file *get_empty_filp(void)
 {
-	struct task_struct *tsk;
+	const struct cred *cred = current_cred();
 	static int old_max;
 	struct file * f;
 
@@ -118,12 +118,11 @@ struct file *get_empty_filp(void)
 	if (security_file_alloc(f))
 		goto fail_sec;
 
-	tsk = current;
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
-	f->f_uid = tsk->cred->fsuid;
-	f->f_gid = tsk->cred->fsgid;
+	f->f_uid = cred->fsuid;
+	f->f_gid = cred->fsgid;
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 870a721b8bd..7d479ce3ace 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -951,6 +951,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 	struct inode *inode;
 	struct dentry *dentry, *root;
 	struct qstr quick_string;
+	struct user_struct *user = current_user();
 
 	if (!hugetlbfs_vfsmount)
 		return ERR_PTR(-ENOENT);
@@ -958,7 +959,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!user_shm_lock(size, current->cred->user))
+	if (!user_shm_lock(size, user))
 		return ERR_PTR(-ENOMEM);
 
 	root = hugetlbfs_vfsmount->mnt_root;
@@ -998,7 +999,7 @@ out_inode:
 out_dentry:
 	dput(dentry);
 out_shm_unlock:
-	user_shm_unlock(size, current->cred->user);
+	user_shm_unlock(size, user);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index bb5210af77c..5112554fd21 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -123,7 +123,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 			break;
 		case IOPRIO_WHO_USER:
 			if (!who)
-				user = current->cred->user;
+				user = current_user();
 			else
 				user = find_user(who);
 
@@ -216,7 +216,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 			break;
 		case IOPRIO_WHO_USER:
 			if (!who)
-				user = current->cred->user;
+				user = current_user();
 			else
 				user = find_user(who);
 
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 9e9bb0db4f6..e7ddd0328dd 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -667,8 +667,7 @@ smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 
 	attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
 	attr.ia_mode = mode;
-	attr.ia_uid = current_euid();
-	attr.ia_gid = current_egid();
+	current_euid_egid(&attr.ia_uid, &attr.ia_gid);
 
 	if (!new_valid_dev(dev))
 		return -EINVAL;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index a7a686074cb..4221ec6000c 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -37,15 +37,16 @@ struct group_info {
  * get_group_info - Get a reference to a group info structure
  * @group_info: The group info to reference
  *
- * This must be called with the owning task locked (via task_lock()) when task
- * != current.  The reason being that the vast majority of callers are looking
- * at current->group_info, which can not be changed except by the current task.
- * Changing current->group_info requires the task lock, too.
+ * This gets a reference to a set of supplementary groups.
+ *
+ * If the caller is accessing a task's credentials, they must hold the RCU read
+ * lock when reading.
  */
-#define get_group_info(group_info)		\
-do {						\
-	atomic_inc(&(group_info)->usage);	\
-} while (0)
+static inline struct group_info *get_group_info(struct group_info *gi)
+{
+	atomic_inc(&gi->usage);
+	return gi;
+}
 
 /**
  * put_group_info - Release a reference to a group info structure
@@ -61,7 +62,7 @@ extern struct group_info *groups_alloc(int);
 extern void groups_free(struct group_info *);
 extern int set_current_groups(struct group_info *);
 extern int set_groups(struct cred *, struct group_info *);
-extern int groups_search(struct group_info *, gid_t);
+extern int groups_search(const struct group_info *, gid_t);
 
 /* access the groups "array" with this macro */
 #define GROUP_AT(gi, i) \
@@ -123,41 +124,6 @@ struct cred {
 	spinlock_t	lock;		/* lock for pointer changes */
 };
 
-#define get_current_user()	(get_uid(current->cred->user))
-
-#define task_uid(task)		((task)->cred->uid)
-#define task_gid(task)		((task)->cred->gid)
-#define task_euid(task)		((task)->cred->euid)
-#define task_egid(task)		((task)->cred->egid)
-
-#define current_uid()		(current->cred->uid)
-#define current_gid()		(current->cred->gid)
-#define current_euid()		(current->cred->euid)
-#define current_egid()		(current->cred->egid)
-#define current_suid()		(current->cred->suid)
-#define current_sgid()		(current->cred->sgid)
-#define current_fsuid()		(current->cred->fsuid)
-#define current_fsgid()		(current->cred->fsgid)
-#define current_cap()		(current->cred->cap_effective)
-
-#define current_uid_gid(_uid, _gid)		\
-do {						\
-	*(_uid) = current->cred->uid;		\
-	*(_gid) = current->cred->gid;		\
-} while(0)
-
-#define current_euid_egid(_uid, _gid)		\
-do {						\
-	*(_uid) = current->cred->euid;		\
-	*(_gid) = current->cred->egid;		\
-} while(0)
-
-#define current_fsuid_fsgid(_uid, _gid)		\
-do {						\
-	*(_uid) = current->cred->fsuid;		\
-	*(_gid) = current->cred->fsgid;		\
-} while(0)
-
 extern void __put_cred(struct cred *);
 extern int copy_creds(struct task_struct *, unsigned long);
 
@@ -187,4 +153,137 @@ static inline void put_cred(struct cred *cred)
 		__put_cred(cred);
 }
 
+/**
+ * current_cred - Access the current task's credentials
+ *
+ * Access the credentials of the current task.
+ */
+#define current_cred() \
+	(current->cred)
+
+/**
+ * __task_cred - Access another task's credentials
+ * @task: The task to query
+ *
+ * Access the credentials of another task.  The caller must hold the
+ * RCU readlock.
+ *
+ * The caller must make sure task doesn't go away, either by holding a ref on
+ * task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+#define __task_cred(task) \
+	((const struct cred *)(rcu_dereference((task)->cred)))
+
+/**
+ * get_task_cred - Get another task's credentials
+ * @task: The task to query
+ *
+ * Get the credentials of a task, pinning them so that they can't go away.
+ * Accessing a task's credentials directly is not permitted.
+ *
+ * The caller must make sure task doesn't go away, either by holding a ref on
+ * task or by holding tasklist_lock to prevent it from being unlinked.
+ */
+#define get_task_cred(task)				\
+({							\
+	struct cred *__cred;				\
+	rcu_read_lock();				\
+	__cred = (struct cred *) __task_cred((task));	\
+	get_cred(__cred);				\
+	rcu_read_unlock();				\
+	__cred;						\
+})
+
+/**
+ * get_current_cred - Get the current task's credentials
+ *
+ * Get the credentials of the current task, pinning them so that they can't go
+ * away.  Accessing the current task's credentials directly is not permitted.
+ */
+#define get_current_cred()				\
+	(get_cred(current_cred()))
+
+/**
+ * get_current_user - Get the current task's user_struct
+ *
+ * Get the user record of the current task, pinning it so that it can't go
+ * away.
+ */
+#define get_current_user()				\
+({							\
+	struct user_struct *__u;			\
+	struct cred *__cred;				\
+	__cred = (struct cred *) current_cred();	\
+	__u = get_uid(__cred->user);			\
+	__u;						\
+})
+
+/**
+ * get_current_groups - Get the current task's supplementary group list
+ *
+ * Get the supplementary group list of the current task, pinning it so that it
+ * can't go away.
+ */
+#define get_current_groups()				\
+({							\
+	struct group_info *__groups;			\
+	struct cred *__cred;				\
+	__cred = (struct cred *) current_cred();	\
+	__groups = get_group_info(__cred->group_info);	\
+	__groups;					\
+})
+
+#define task_cred_xxx(task, xxx)		\
+({						\
+	__typeof__(task->cred->xxx) ___val;	\
+	rcu_read_lock();			\
+	___val = __task_cred((task))->xxx;	\
+	rcu_read_unlock();			\
+	___val;					\
+})
+
+#define task_uid(task)		(task_cred_xxx((task), uid))
+#define task_euid(task)		(task_cred_xxx((task), euid))
+
+#define current_cred_xxx(xxx)			\
+({						\
+	current->cred->xxx;			\
+})
+
+#define current_uid()		(current_cred_xxx(uid))
+#define current_gid()		(current_cred_xxx(gid))
+#define current_euid()		(current_cred_xxx(euid))
+#define current_egid()		(current_cred_xxx(egid))
+#define current_suid()		(current_cred_xxx(suid))
+#define current_sgid()		(current_cred_xxx(sgid))
+#define current_fsuid() 	(current_cred_xxx(fsuid))
+#define current_fsgid() 	(current_cred_xxx(fsgid))
+#define current_cap()		(current_cred_xxx(cap_effective))
+#define current_user()		(current_cred_xxx(user))
+#define current_security()	(current_cred_xxx(security))
+
+#define current_uid_gid(_uid, _gid)		\
+do {						\
+	const struct cred *__cred;		\
+	__cred = current_cred();		\
+	*(_uid) = __cred->uid;			\
+	*(_gid) = __cred->gid;			\
+} while(0)
+
+#define current_euid_egid(_euid, _egid)		\
+do {						\
+	const struct cred *__cred;		\
+	__cred = current_cred();		\
+	*(_euid) = __cred->euid;		\
+	*(_egid) = __cred->egid;		\
+} while(0)
+
+#define current_fsuid_fsgid(_fsuid, _fsgid)	\
+do {						\
+	const struct cred *__cred;		\
+	__cred = current_cred();		\
+	*(_fsuid) = __cred->fsuid;		\
+	*(_fsgid) = __cred->fsgid;		\
+} while(0)
+
 #endif /* _LINUX_CRED_H */
diff --git a/include/linux/securebits.h b/include/linux/securebits.h
index 6d389491bfa..d2c5ed845bc 100644
--- a/include/linux/securebits.h
+++ b/include/linux/securebits.h
@@ -32,7 +32,7 @@
    setting is locked or not. A setting which is locked cannot be
    changed from user-level. */
 #define issecure_mask(X)	(1 << (X))
-#define issecure(X)		(issecure_mask(X) & current->cred->securebits)
+#define issecure(X)		(issecure_mask(X) & current_cred_xxx(securebits))
 
 #define SECURE_ALL_BITS		(issecure_mask(SECURE_NOROOT) | \
 				 issecure_mask(SECURE_NO_SETUID_FIXUP) | \
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index e1885b494ba..1151881ccb9 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -112,6 +112,7 @@ static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
 static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 							struct mq_attr *attr)
 {
+	struct user_struct *u = current_user();
 	struct inode *inode;
 
 	inode = new_inode(sb);
@@ -126,7 +127,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 		if (S_ISREG(mode)) {
 			struct mqueue_inode_info *info;
 			struct task_struct *p = current;
-			struct user_struct *u = p->cred->user;
 			unsigned long mq_bytes, mq_msg_tblsz;
 
 			inode->i_fop = &mqueue_file_operations;
diff --git a/ipc/shm.c b/ipc/shm.c
index 264a9d33c5d..38a055758a9 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -366,7 +366,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	if (shmflg & SHM_HUGETLB) {
 		/* hugetlb_file_setup takes care of mlock user accounting */
 		file = hugetlb_file_setup(name, size);
-		shp->mlock_user = current->cred->user;
+		shp->mlock_user = current_user();
 	} else {
 		int acctflag = VM_ACCOUNT;
 		/*
@@ -767,7 +767,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
 			goto out_unlock;
 		
 		if(cmd==SHM_LOCK) {
-			struct user_struct *user = current->cred->user;
+			struct user_struct *user = current_user();
 			if (!is_file_hugepages(shp->shm_file)) {
 				err = shmem_lock(shp->shm_file, 1, user);
 				if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){
diff --git a/kernel/sys.c b/kernel/sys.c
index 5d81f07c015..c4d6b59553e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -143,6 +143,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
+	const struct cred *cred = current_cred();
 	int error = -EINVAL;
 	struct pid *pgrp;
 
@@ -176,18 +177,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->cred->user;
+			user = cred->user;
 			if (!who)
-				who = current_uid();
-			else
-				if (who != current_uid() && !(user = find_user(who)))
-					goto out_unlock;	/* No processes for this user */
+				who = cred->uid;
+			else if ((who != cred->uid) &&
+				 !(user = find_user(who)))
+				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->cred->uid == who)
+				if (__task_cred(p)->uid == who)
 					error = set_one_prio(p, niceval, error);
 			while_each_thread(g, p);
-			if (who != current_uid())
+			if (who != cred->uid)
 				free_uid(user);		/* For find_user() */
 			break;
 	}
@@ -207,6 +208,7 @@ asmlinkage long sys_getpriority(int which, int who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
+	const struct cred *cred = current_cred();
 	long niceval, retval = -ESRCH;
 	struct pid *pgrp;
 
@@ -238,21 +240,21 @@ asmlinkage long sys_getpriority(int which, int who)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = current->cred->user;
+			user = (struct user_struct *) cred->user;
 			if (!who)
-				who = current_uid();
-			else
-				if (who != current_uid() && !(user = find_user(who)))
-					goto out_unlock;	/* No processes for this user */
+				who = cred->uid;
+			else if ((who != cred->uid) &&
+				 !(user = find_user(who)))
+				goto out_unlock;	/* No processes for this user */
 
 			do_each_thread(g, p)
-				if (p->cred->uid == who) {
+				if (__task_cred(p)->uid == who) {
 					niceval = 20 - task_nice(p);
 					if (niceval > retval)
 						retval = niceval;
 				}
 			while_each_thread(g, p);
-			if (who != current_uid())
+			if (who != cred->uid)
 				free_uid(user);		/* for find_user() */
 			break;
 	}
@@ -743,11 +745,11 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(cred->uid, ruid)) &&
-	    !(retval = put_user(cred->euid, euid)))
+	if (!(retval   = put_user(cred->uid,  ruid)) &&
+	    !(retval   = put_user(cred->euid, euid)))
 		retval = put_user(cred->suid, suid);
 
 	return retval;
@@ -796,11 +798,11 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(cred->gid, rgid)) &&
-	    !(retval = put_user(cred->egid, egid)))
+	if (!(retval   = put_user(cred->gid,  rgid)) &&
+	    !(retval   = put_user(cred->egid, egid)))
 		retval = put_user(cred->sgid, sgid);
 
 	return retval;
@@ -1199,7 +1201,7 @@ static void groups_sort(struct group_info *group_info)
 }
 
 /* a simple bsearch */
-int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(const struct group_info *group_info, gid_t grp)
 {
 	unsigned int left, right;
 
@@ -1268,13 +1270,8 @@ EXPORT_SYMBOL(set_current_groups);
 
 asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 {
-	struct cred *cred = current->cred;
-	int i = 0;
-
-	/*
-	 *	SMP: Nobody else can change our grouplist. Thus we are
-	 *	safe.
-	 */
+	const struct cred *cred = current_cred();
+	int i;
 
 	if (gidsetsize < 0)
 		return -EINVAL;
@@ -1330,8 +1327,9 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
  */
 int in_group_p(gid_t grp)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval = 1;
+
 	if (grp != cred->fsgid)
 		retval = groups_search(cred->group_info, grp);
 	return retval;
@@ -1341,8 +1339,9 @@ EXPORT_SYMBOL(in_group_p);
 
 int in_egroup_p(gid_t grp)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 	int retval = 1;
+
 	if (grp != cred->egid)
 		retval = groups_search(cred->group_info, grp);
 	return retval;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 71f07fc39fe..2460c3199b5 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
 
 asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
 {
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(high2lowuid(current->cred->uid), ruid)) &&
-	    !(retval = put_user(high2lowuid(current->cred->euid), euid)))
-		retval = put_user(high2lowuid(current->cred->suid), suid);
+	if (!(retval   = put_user(high2lowuid(cred->uid),  ruid)) &&
+	    !(retval   = put_user(high2lowuid(cred->euid), euid)))
+		retval = put_user(high2lowuid(cred->suid), suid);
 
 	return retval;
 }
@@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
 
 asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
 {
+	const struct cred *cred = current_cred();
 	int retval;
 
-	if (!(retval = put_user(high2lowgid(current->cred->gid), rgid)) &&
-	    !(retval = put_user(high2lowgid(current->cred->egid), egid)))
-		retval = put_user(high2lowgid(current->cred->sgid), sgid);
+	if (!(retval   = put_user(high2lowgid(cred->gid),  rgid)) &&
+	    !(retval   = put_user(high2lowgid(cred->egid), egid)))
+		retval = put_user(high2lowgid(cred->sgid), sgid);
 
 	return retval;
 }
@@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info,
 
 asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
 {
-	int i = 0;
+	const struct cred *cred = current_cred();
+	int i;
 
 	if (gidsetsize < 0)
 		return -EINVAL;
 
-	get_group_info(current->cred->group_info);
-	i = current->cred->group_info->ngroups;
+	i = cred->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
 			i = -EINVAL;
 			goto out;
 		}
-		if (groups16_to_user(grouplist, current->cred->group_info)) {
+		if (groups16_to_user(grouplist, cred->group_info)) {
 			i = -EFAULT;
 			goto out;
 		}
 	}
 out:
-	put_group_info(current->cred->group_info);
 	return i;
 }
 
@@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
 
 asmlinkage long sys_getuid16(void)
 {
-	return high2lowuid(current->cred->uid);
+	return high2lowuid(current_uid());
 }
 
 asmlinkage long sys_geteuid16(void)
 {
-	return high2lowuid(current->cred->euid);
+	return high2lowuid(current_euid());
 }
 
 asmlinkage long sys_getgid16(void)
 {
-	return high2lowgid(current->cred->gid);
+	return high2lowgid(current_gid());
 }
 
 asmlinkage long sys_getegid16(void)
 {
-	return high2lowgid(current->cred->egid);
+	return high2lowgid(current_egid());
 }
diff --git a/net/core/scm.c b/net/core/scm.c
index c28ca32a7d9..f73c44b17dd 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -44,7 +44,7 @@
 
 static __inline__ int scm_check_creds(struct ucred *creds)
 {
-	struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 
 	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
 	    ((creds->uid == cred->uid   || creds->uid == cred->euid ||
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index c7954321260..0443f834945 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -350,16 +350,18 @@ EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache);
 struct rpc_cred *
 rpcauth_lookupcred(struct rpc_auth *auth, int flags)
 {
-	struct auth_cred acred = {
-		.uid = current_fsuid(),
-		.gid = current_fsgid(),
-		.group_info = current->cred->group_info,
-	};
+	struct auth_cred acred;
 	struct rpc_cred *ret;
+	const struct cred *cred = current_cred();
 
 	dprintk("RPC:       looking up %s cred\n",
 		auth->au_ops->au_name);
-	get_group_info(acred.group_info);
+
+	memset(&acred, 0, sizeof(acred));
+	acred.uid = cred->fsuid;
+	acred.gid = cred->fsgid;
+	acred.group_info = get_group_info(((struct cred *)cred)->group_info);
+
 	ret = auth->au_ops->lookup_cred(auth, &acred, flags);
 	put_group_info(acred.group_info);
 	return ret;
diff --git a/security/commoncap.c b/security/commoncap.c
index fa61679f8c7..61307f59000 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -641,7 +641,7 @@ int cap_task_setnice (struct task_struct *p, int nice)
 int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 		   unsigned long arg4, unsigned long arg5, long *rc_p)
 {
-	struct cred *cred = current->cred;
+	struct cred *cred = current_cred();
 	long error = 0;
 
 	switch (option) {
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index b0904cdda2e..ce8ac6073d5 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -582,7 +582,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 {
 	struct request_key_auth *rka;
 	struct task_struct *t = current;
-	struct cred *cred = t->cred;
+	struct cred *cred = current_cred();
 	struct key *key;
 	key_ref_t key_ref, skey_ref;
 	int ret;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 3e9b9eb1dd2..0488b0af5bd 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -67,6 +67,7 @@ static int call_sbin_request_key(struct key_construction *cons,
 				 void *aux)
 {
 	struct task_struct *tsk = current;
+	const struct cred *cred = current_cred();
 	key_serial_t prkey, sskey;
 	struct key *key = cons->key, *authkey = cons->authkey, *keyring;
 	char *argv[9], *envp[3], uid_str[12], gid_str[12];
@@ -96,16 +97,16 @@ static int call_sbin_request_key(struct key_construction *cons,
 		goto error_link;
 
 	/* record the UID and GID */
-	sprintf(uid_str, "%d", current_fsuid());
-	sprintf(gid_str, "%d", current_fsgid());
+	sprintf(uid_str, "%d", cred->fsuid);
+	sprintf(gid_str, "%d", cred->fsgid);
 
 	/* we say which key is under construction */
 	sprintf(key_str, "%d", key->serial);
 
 	/* we specify the process's default keyrings */
 	sprintf(keyring_str[0], "%d",
-		tsk->cred->thread_keyring ?
-		tsk->cred->thread_keyring->serial : 0);
+		cred->thread_keyring ?
+		cred->thread_keyring->serial : 0);
 
 	prkey = 0;
 	if (tsk->signal->process_keyring)
@@ -118,7 +119,7 @@ static int call_sbin_request_key(struct key_construction *cons,
 		sskey = rcu_dereference(tsk->signal->session_keyring)->serial;
 		rcu_read_unlock();
 	} else {
-		sskey = tsk->cred->user->session_keyring->serial;
+		sskey = cred->user->session_keyring->serial;
 	}
 
 	sprintf(keyring_str[2], "%d", sskey);
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index cf02490cd1e..c73aeaa008e 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -39,9 +39,13 @@ EXPORT_SYMBOL_GPL(selinux_string_to_sid);
 int selinux_secmark_relabel_packet_permission(u32 sid)
 {
 	if (selinux_enabled) {
-		struct task_security_struct *tsec = current->cred->security;
+		const struct task_security_struct *__tsec;
+		u32 tsid;
 
-		return avc_has_perm(tsec->sid, sid, SECCLASS_PACKET,
+		__tsec = current_security();
+		tsid = __tsec->sid;
+
+		return avc_has_perm(tsid, sid, SECCLASS_PACKET,
 				    PACKET__RELABELTO, NULL);
 	}
 	return 0;
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index d7db76617b0..c0eb72013d6 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -197,7 +197,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp,
 	struct xfrm_user_sec_ctx *uctx, u32 sid)
 {
 	int rc = 0;
-	struct task_security_struct *tsec = current->cred->security;
+	const struct task_security_struct *tsec = current_security();
 	struct xfrm_sec_ctx *ctx = NULL;
 	char *ctx_str = NULL;
 	u32 str_len;
@@ -333,7 +333,7 @@ void selinux_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
  */
 int selinux_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
 {
-	struct task_security_struct *tsec = current->cred->security;
+	const struct task_security_struct *tsec = current_security();
 	int rc = 0;
 
 	if (ctx) {
@@ -378,7 +378,7 @@ void selinux_xfrm_state_free(struct xfrm_state *x)
   */
 int selinux_xfrm_state_delete(struct xfrm_state *x)
 {
-	struct task_security_struct *tsec = current->cred->security;
+	const struct task_security_struct *tsec = current_security();
 	struct xfrm_sec_ctx *ctx = x->security;
 	int rc = 0;
 
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index b6dd4fc0fb0..247cec3b5a4 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -164,7 +164,7 @@ int smk_curacc(char *obj_label, u32 mode)
 {
 	int rc;
 
-	rc = smk_access(current->cred->security, obj_label, mode);
+	rc = smk_access(current_security(), obj_label, mode);
 	if (rc == 0)
 		return 0;
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index cc837314fb0..e8a4fcb1ad0 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -143,7 +143,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
 static int smack_syslog(int type)
 {
 	int rc;
-	char *sp = current->cred->security;
+	char *sp = current_security();
 
 	rc = cap_syslog(type);
 	if (rc != 0)
@@ -375,7 +375,7 @@ static int smack_sb_umount(struct vfsmount *mnt, int flags)
  */
 static int smack_inode_alloc_security(struct inode *inode)
 {
-	inode->i_security = new_inode_smack(current->cred->security);
+	inode->i_security = new_inode_smack(current_security());
 	if (inode->i_security == NULL)
 		return -ENOMEM;
 	return 0;
@@ -820,7 +820,7 @@ static int smack_file_permission(struct file *file, int mask)
  */
 static int smack_file_alloc_security(struct file *file)
 {
-	file->f_security = current->cred->security;
+	file->f_security = current_security();
 	return 0;
 }
 
@@ -918,7 +918,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
  */
 static int smack_file_set_fowner(struct file *file)
 {
-	file->f_security = current->cred->security;
+	file->f_security = current_security();
 	return 0;
 }
 
@@ -986,8 +986,7 @@ static int smack_file_receive(struct file *file)
  */
 static int smack_cred_alloc_security(struct cred *cred)
 {
-	cred->security = current->cred->security;
-
+	cred->security = current_security();
 	return 0;
 }
 
@@ -1225,7 +1224,7 @@ static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
  */
 static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags)
 {
-	char *csp = current->cred->security;
+	char *csp = current_security();
 	struct socket_smack *ssp;
 
 	ssp = kzalloc(sizeof(struct socket_smack), gfp_flags);
@@ -1450,7 +1449,7 @@ static int smack_flags_to_may(int flags)
  */
 static int smack_msg_msg_alloc_security(struct msg_msg *msg)
 {
-	msg->security = current->cred->security;
+	msg->security = current_security();
 	return 0;
 }
 
@@ -1486,7 +1485,7 @@ static int smack_shm_alloc_security(struct shmid_kernel *shp)
 {
 	struct kern_ipc_perm *isp = &shp->shm_perm;
 
-	isp->security = current->cred->security;
+	isp->security = current_security();
 	return 0;
 }
 
@@ -1595,7 +1594,7 @@ static int smack_sem_alloc_security(struct sem_array *sma)
 {
 	struct kern_ipc_perm *isp = &sma->sem_perm;
 
-	isp->security = current->cred->security;
+	isp->security = current_security();
 	return 0;
 }
 
@@ -1699,7 +1698,7 @@ static int smack_msg_queue_alloc_security(struct msg_queue *msq)
 {
 	struct kern_ipc_perm *kisp = &msq->q_perm;
 
-	kisp->security = current->cred->security;
+	kisp->security = current_security();
 	return 0;
 }
 
@@ -1854,7 +1853,7 @@ static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
 	struct super_block *sbp;
 	struct superblock_smack *sbsp;
 	struct inode_smack *isp;
-	char *csp = current->cred->security;
+	char *csp = current_security();
 	char *fetched;
 	char *final;
 	struct dentry *dp;
@@ -2290,8 +2289,7 @@ static void smack_sock_graft(struct sock *sk, struct socket *parent)
 		return;
 
 	ssp = sk->sk_security;
-	ssp->smk_in = current->cred->security;
-	ssp->smk_out = current->cred->security;
+	ssp->smk_in = ssp->smk_out = current_security();
 	ssp->smk_packet[0] = '\0';
 
 	rc = smack_netlabel(sk);
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index c5ca279e050..ca257dfdc75 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -336,7 +336,7 @@ static void smk_cipso_doi(void)
 
 	audit_info.loginuid = audit_get_loginuid(current);
 	audit_info.sessionid = audit_get_sessionid(current);
-	audit_info.secid = smack_to_secid(current->cred->security);
+	audit_info.secid = smack_to_secid(current_security());
 
 	rc = netlbl_cfg_map_del(NULL, &audit_info);
 	if (rc != 0)
@@ -371,7 +371,7 @@ static void smk_unlbl_ambient(char *oldambient)
 
 	audit_info.loginuid = audit_get_loginuid(current);
 	audit_info.sessionid = audit_get_sessionid(current);
-	audit_info.secid = smack_to_secid(current->cred->security);
+	audit_info.secid = smack_to_secid(current_security());
 
 	if (oldambient != NULL) {
 		rc = netlbl_cfg_map_del(oldambient, &audit_info);
-- 
cgit v1.2.3-70-g09d2


From c69e8d9c01db2adc503464993c358901c9af9de4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:19 +1100
Subject: CRED: Use RCU to access another task's creds and to release a task's
 own creds

Use RCU to access another task's creds and to release a task's own creds.
This means that it will be possible for the credentials of a task to be
replaced without another task (a) requiring a full lock to read them, and (b)
seeing deallocated memory.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 arch/ia64/kernel/perfmon.c   | 32 +++++++++++++---------
 drivers/connector/cn_proc.c  | 16 +++++++----
 fs/binfmt_elf.c              |  8 ++++--
 fs/binfmt_elf_fdpic.c        |  8 ++++--
 fs/fcntl.c                   | 15 ++++++++---
 fs/fuse/dir.c                | 23 ++++++++++------
 fs/ioprio.c                  | 14 +++++++---
 fs/proc/array.c              | 32 ++++++++++++++--------
 fs/proc/base.c               | 32 ++++++++++++++++------
 include/linux/cred.h         |  3 ++-
 kernel/auditsc.c             | 33 +++++++++++++----------
 kernel/cgroup.c              | 16 +++++------
 kernel/exit.c                | 14 ++++++----
 kernel/futex.c               | 22 +++++++++------
 kernel/futex_compat.c        |  7 ++---
 kernel/ptrace.c              | 22 ++++++++-------
 kernel/sched.c               | 31 ++++++++++++++-------
 kernel/signal.c              | 49 ++++++++++++++++++++-------------
 kernel/sys.c                 | 11 +++++---
 kernel/tsacct.c              |  6 +++--
 mm/mempolicy.c               |  8 +++---
 mm/migrate.c                 |  8 +++---
 mm/oom_kill.c                |  6 ++---
 security/commoncap.c         | 64 +++++++++++++++++++++++++++-----------------
 security/keys/permission.c   | 10 ++++---
 security/keys/process_keys.c | 24 ++++++++++-------
 security/selinux/selinuxfs.c | 13 ++++++---
 security/smack/smack_lsm.c   | 32 +++++++++++-----------
 28 files changed, 355 insertions(+), 204 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index dd38db46a77..0e499757309 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2399,25 +2399,33 @@ error_kmem:
 static int
 pfm_bad_permissions(struct task_struct *task)
 {
+	const struct cred *tcred;
 	uid_t uid = current_uid();
 	gid_t gid = current_gid();
+	int ret;
+
+	rcu_read_lock();
+	tcred = __task_cred(task);
 
 	/* inspired by ptrace_attach() */
 	DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n",
 		uid,
 		gid,
-		task->euid,
-		task->suid,
-		task->uid,
-		task->egid,
-		task->sgid));
-
-	return (uid != task->euid)
-	    || (uid != task->suid)
-	    || (uid != task->uid)
-	    || (gid != task->egid)
-	    || (gid != task->sgid)
-	    || (gid != task->gid)) && !capable(CAP_SYS_PTRACE);
+		tcred->euid,
+		tcred->suid,
+		tcred->uid,
+		tcred->egid,
+		tcred->sgid));
+
+	ret = ((uid != tcred->euid)
+	       || (uid != tcred->suid)
+	       || (uid != tcred->uid)
+	       || (gid != tcred->egid)
+	       || (gid != tcred->sgid)
+	       || (gid != tcred->gid)) && !capable(CAP_SYS_PTRACE);
+
+	rcu_read_unlock();
+	return ret;
 }
 
 static int
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 354c1ff1715..c5afc98e267 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -106,6 +106,7 @@ void proc_id_connector(struct task_struct *task, int which_id)
 	struct proc_event *ev;
 	__u8 buffer[CN_PROC_MSG_SIZE];
 	struct timespec ts;
+	const struct cred *cred;
 
 	if (atomic_read(&proc_event_num_listeners) < 1)
 		return;
@@ -115,14 +116,19 @@ void proc_id_connector(struct task_struct *task, int which_id)
 	ev->what = which_id;
 	ev->event_data.id.process_pid = task->pid;
 	ev->event_data.id.process_tgid = task->tgid;
+	rcu_read_lock();
+	cred = __task_cred(task);
 	if (which_id == PROC_EVENT_UID) {
-		ev->event_data.id.r.ruid = task->cred->uid;
-		ev->event_data.id.e.euid = task->cred->euid;
+		ev->event_data.id.r.ruid = cred->uid;
+		ev->event_data.id.e.euid = cred->euid;
 	} else if (which_id == PROC_EVENT_GID) {
-		ev->event_data.id.r.rgid = task->cred->gid;
-		ev->event_data.id.e.egid = task->cred->egid;
-	} else
+		ev->event_data.id.r.rgid = cred->gid;
+		ev->event_data.id.e.egid = cred->egid;
+	} else {
+		rcu_read_unlock();
 	     	return;
+	}
+	rcu_read_unlock();
 	get_seq(&msg->seq, &ev->cpu);
 	ktime_get_ts(&ts); /* get high res monotonic timestamp */
 	put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0e665561316..9142ff5dc8e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1361,6 +1361,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 		       struct mm_struct *mm)
 {
+	const struct cred *cred;
 	unsigned int i, len;
 	
 	/* first copy the parameters from user space */
@@ -1388,8 +1389,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
 	psinfo->pr_nice = task_nice(p);
 	psinfo->pr_flag = p->flags;
-	SET_UID(psinfo->pr_uid, p->cred->uid);
-	SET_GID(psinfo->pr_gid, p->cred->gid);
+	rcu_read_lock();
+	cred = __task_cred(p);
+	SET_UID(psinfo->pr_uid, cred->uid);
+	SET_GID(psinfo->pr_gid, cred->gid);
+	rcu_read_unlock();
 	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
 	
 	return 0;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 1f6e8c023b4..45dabd59936 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1414,6 +1414,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 		       struct mm_struct *mm)
 {
+	const struct cred *cred;
 	unsigned int i, len;
 
 	/* first copy the parameters from user space */
@@ -1441,8 +1442,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
 	psinfo->pr_nice = task_nice(p);
 	psinfo->pr_flag = p->flags;
-	SET_UID(psinfo->pr_uid, p->cred->uid);
-	SET_GID(psinfo->pr_gid, p->cred->gid);
+	rcu_read_lock();
+	cred = __task_cred(p);
+	SET_UID(psinfo->pr_uid, cred->uid);
+	SET_GID(psinfo->pr_gid, cred->gid);
+	rcu_read_unlock();
 	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
 
 	return 0;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c594cc0e40f..87c39f1f081 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -401,10 +401,17 @@ static const long band_table[NSIGPOLL] = {
 static inline int sigio_perm(struct task_struct *p,
                              struct fown_struct *fown, int sig)
 {
-	return (((fown->euid == 0) ||
-		 (fown->euid == p->cred->suid) || (fown->euid == p->cred->uid) ||
-		 (fown->uid == p->cred->suid) || (fown->uid == p->cred->uid)) &&
-		!security_file_send_sigiotask(p, fown, sig));
+	const struct cred *cred;
+	int ret;
+
+	rcu_read_lock();
+	cred = __task_cred(p);
+	ret = ((fown->euid == 0 ||
+		fown->euid == cred->suid || fown->euid == cred->uid ||
+		fown->uid  == cred->suid || fown->uid  == cred->uid) &&
+	       !security_file_send_sigiotask(p, fown, sig));
+	rcu_read_unlock();
+	return ret;
 }
 
 static void send_sigio_to_task(struct task_struct *p,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e97a9898186..95bc22bdd06 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -869,18 +869,25 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
  */
 int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
 {
+	const struct cred *cred;
+	int ret;
+
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		return 1;
 
-	if (task->cred->euid == fc->user_id &&
-	    task->cred->suid == fc->user_id &&
-	    task->cred->uid == fc->user_id &&
-	    task->cred->egid == fc->group_id &&
-	    task->cred->sgid == fc->group_id &&
-	    task->cred->gid == fc->group_id)
-		return 1;
+	rcu_read_lock();
+	ret = 0;
+	cred = __task_cred(task);
+	if (cred->euid == fc->user_id &&
+	    cred->suid == fc->user_id &&
+	    cred->uid  == fc->user_id &&
+	    cred->egid == fc->group_id &&
+	    cred->sgid == fc->group_id &&
+	    cred->gid  == fc->group_id)
+		ret = 1;
+	rcu_read_unlock();
 
-	return 0;
+	return ret;
 }
 
 static int fuse_access(struct inode *inode, int mask)
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 5112554fd21..3569e0ad86a 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -31,10 +31,16 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 {
 	int err;
 	struct io_context *ioc;
+	const struct cred *cred = current_cred(), *tcred;
 
-	if (task->cred->uid != current_euid() &&
-	    task->cred->uid != current_uid() && !capable(CAP_SYS_NICE))
+	rcu_read_lock();
+	tcred = __task_cred(task);
+	if (tcred->uid != cred->euid &&
+	    tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 
 	err = security_task_setioprio(task, ioprio);
 	if (err)
@@ -131,7 +137,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 				break;
 
 			do_each_thread(g, p) {
-				if (p->cred->uid != who)
+				if (__task_cred(p)->uid != who)
 					continue;
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
@@ -224,7 +230,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 				break;
 
 			do_each_thread(g, p) {
-				if (p->cred->uid != user->uid)
+				if (__task_cred(p)->uid != user->uid)
 					continue;
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 62fe9b2009b..7e4877d9dcb 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	struct group_info *group_info;
 	int g;
 	struct fdtable *fdt = NULL;
+	const struct cred *cred;
 	pid_t ppid, tpid;
 
 	rcu_read_lock();
@@ -170,6 +171,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
+	cred = get_cred((struct cred *) __task_cred(p));
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -182,8 +184,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		task_tgid_nr_ns(p, ns),
 		pid_nr_ns(pid, ns),
 		ppid, tpid,
-		p->cred->uid, p->cred->euid, p->cred->suid, p->cred->fsuid,
-		p->cred->gid, p->cred->egid, p->cred->sgid, p->cred->fsgid);
+		cred->uid, cred->euid, cred->suid, cred->fsuid,
+		cred->gid, cred->egid, cred->sgid, cred->fsgid);
 
 	task_lock(p);
 	if (p->files)
@@ -194,13 +196,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		fdt ? fdt->max_fds : 0);
 	rcu_read_unlock();
 
-	group_info = p->cred->group_info;
-	get_group_info(group_info);
+	group_info = cred->group_info;
 	task_unlock(p);
 
 	for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
 		seq_printf(m, "%d ", GROUP_AT(group_info, g));
-	put_group_info(group_info);
+	put_cred(cred);
 
 	seq_printf(m, "\n");
 }
@@ -262,7 +263,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = atomic_read(&p->signal->count);
-		qsize = atomic_read(&p->cred->user->sigpending);
+		qsize = atomic_read(&__task_cred(p)->user->sigpending);
 		qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
 		unlock_task_sighand(p, &flags);
 	}
@@ -293,12 +294,21 @@ static void render_cap_t(struct seq_file *m, const char *header,
 
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
 {
-	struct cred *cred = p->cred;
+	const struct cred *cred;
+	kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
 
-	render_cap_t(m, "CapInh:\t", &cred->cap_inheritable);
-	render_cap_t(m, "CapPrm:\t", &cred->cap_permitted);
-	render_cap_t(m, "CapEff:\t", &cred->cap_effective);
-	render_cap_t(m, "CapBnd:\t", &cred->cap_bset);
+	rcu_read_lock();
+	cred = __task_cred(p);
+	cap_inheritable	= cred->cap_inheritable;
+	cap_permitted	= cred->cap_permitted;
+	cap_effective	= cred->cap_effective;
+	cap_bset	= cred->cap_bset;
+	rcu_read_unlock();
+
+	render_cap_t(m, "CapInh:\t", &cap_inheritable);
+	render_cap_t(m, "CapPrm:\t", &cap_permitted);
+	render_cap_t(m, "CapEff:\t", &cap_effective);
+	render_cap_t(m, "CapBnd:\t", &cap_bset);
 }
 
 static inline void task_context_switch_counts(struct seq_file *m,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6862b360c36..cf42c42cbfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1406,6 +1406,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 {
 	struct inode * inode;
 	struct proc_inode *ei;
+	const struct cred *cred;
 
 	/* We need a new inode */
 
@@ -1428,8 +1429,11 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
-		inode->i_uid = task->cred->euid;
-		inode->i_gid = task->cred->egid;
+		rcu_read_lock();
+		cred = __task_cred(task);
+		inode->i_uid = cred->euid;
+		inode->i_gid = cred->egid;
+		rcu_read_unlock();
 	}
 	security_task_to_inode(task, inode);
 
@@ -1445,6 +1449,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
 {
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task;
+	const struct cred *cred;
+
 	generic_fillattr(inode, stat);
 
 	rcu_read_lock();
@@ -1454,8 +1460,9 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
 	if (task) {
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
-			stat->uid = task->cred->euid;
-			stat->gid = task->cred->egid;
+			cred = __task_cred(task);
+			stat->uid = cred->euid;
+			stat->gid = cred->egid;
 		}
 	}
 	rcu_read_unlock();
@@ -1483,11 +1490,16 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task = get_proc_task(inode);
+	const struct cred *cred;
+
 	if (task) {
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
-			inode->i_uid = task->cred->euid;
-			inode->i_gid = task->cred->egid;
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
 		} else {
 			inode->i_uid = 0;
 			inode->i_gid = 0;
@@ -1649,6 +1661,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct task_struct *task = get_proc_task(inode);
 	int fd = proc_fd(inode);
 	struct files_struct *files;
+	const struct cred *cred;
 
 	if (task) {
 		files = get_files_struct(task);
@@ -1658,8 +1671,11 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 				rcu_read_unlock();
 				put_files_struct(files);
 				if (task_dumpable(task)) {
-					inode->i_uid = task->cred->euid;
-					inode->i_gid = task->cred->egid;
+					rcu_read_lock();
+					cred = __task_cred(task);
+					inode->i_uid = cred->euid;
+					inode->i_gid = cred->egid;
+					rcu_read_unlock();
 				} else {
 					inode->i_uid = 0;
 					inode->i_gid = 0;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4221ec6000c..166ce4ddba6 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -147,8 +147,9 @@ static inline struct cred *get_cred(struct cred *cred)
  * Release a reference to a set of credentials, deleting them when the last ref
  * is released.
  */
-static inline void put_cred(struct cred *cred)
+static inline void put_cred(const struct cred *_cred)
 {
+	struct cred *cred = (struct cred *) _cred;
 	if (atomic_dec_and_test(&(cred)->usage))
 		__put_cred(cred);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2febf5165fa..ae8ef88ade3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -447,7 +447,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			      struct audit_names *name,
 			      enum audit_state *state)
 {
-	struct cred *cred = tsk->cred;
+	const struct cred *cred = get_task_cred(tsk);
 	int i, j, need_sid = 1;
 	u32 sid;
 
@@ -642,8 +642,10 @@ static int audit_filter_rules(struct task_struct *tsk,
 			break;
 		}
 
-		if (!result)
+		if (!result) {
+			put_cred(cred);
 			return 0;
+		}
 	}
 	if (rule->filterkey && ctx)
 		ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -651,6 +653,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
 	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
 	}
+	put_cred(cred);
 	return 1;
 }
 
@@ -1229,7 +1232,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
-	struct cred *cred = tsk->cred;
+	const struct cred *cred;
 	int i, call_panic = 0;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
@@ -1239,13 +1242,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	context->pid = tsk->pid;
 	if (!context->ppid)
 		context->ppid = sys_getppid();
-	context->uid = cred->uid;
-	context->gid = cred->gid;
-	context->euid = cred->euid;
-	context->suid = cred->suid;
+	cred = current_cred();
+	context->uid   = cred->uid;
+	context->gid   = cred->gid;
+	context->euid  = cred->euid;
+	context->suid  = cred->suid;
 	context->fsuid = cred->fsuid;
-	context->egid = cred->egid;
-	context->sgid = cred->sgid;
+	context->egid  = cred->egid;
+	context->sgid  = cred->sgid;
 	context->fsgid = cred->fsgid;
 	context->personality = tsk->personality;
 
@@ -2088,7 +2092,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 			audit_log_format(ab, "login pid=%d uid=%u "
 				"old auid=%u new auid=%u"
 				" old ses=%u new ses=%u",
-				task->pid, task->cred->uid,
+				task->pid, task_uid(task),
 				task->loginuid, loginuid,
 				task->sessionid, sessionid);
 			audit_log_end(ab);
@@ -2471,7 +2475,7 @@ void __audit_ptrace(struct task_struct *t)
 
 	context->target_pid = t->pid;
 	context->target_auid = audit_get_loginuid(t);
-	context->target_uid = t->cred->uid;
+	context->target_uid = task_uid(t);
 	context->target_sessionid = audit_get_sessionid(t);
 	security_task_getsecid(t, &context->target_sid);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2490,6 +2494,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_aux_data_pids *axp;
 	struct task_struct *tsk = current;
 	struct audit_context *ctx = tsk->audit_context;
+	uid_t uid = current_uid(), t_uid = task_uid(t);
 
 	if (audit_pid && t->tgid == audit_pid) {
 		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
@@ -2497,7 +2502,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 			if (tsk->loginuid != -1)
 				audit_sig_uid = tsk->loginuid;
 			else
-				audit_sig_uid = tsk->cred->uid;
+				audit_sig_uid = uid;
 			security_task_getsecid(tsk, &audit_sig_sid);
 		}
 		if (!audit_signals || audit_dummy_context())
@@ -2509,7 +2514,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	if (!ctx->target_pid) {
 		ctx->target_pid = t->tgid;
 		ctx->target_auid = audit_get_loginuid(t);
-		ctx->target_uid = t->cred->uid;
+		ctx->target_uid = t_uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
 		security_task_getsecid(t, &ctx->target_sid);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2530,7 +2535,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 	axp->target_pid[axp->pid_count] = t->tgid;
 	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
-	axp->target_uid[axp->pid_count] = t->cred->uid;
+	axp->target_uid[axp->pid_count] = t_uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
 	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e210526e640..a512a75a556 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1279,7 +1279,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
 	struct task_struct *tsk;
-	uid_t euid;
+	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
 	if (pid) {
@@ -1289,16 +1289,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 			rcu_read_unlock();
 			return -ESRCH;
 		}
-		get_task_struct(tsk);
-		rcu_read_unlock();
 
-		euid = current_euid();
-		if (euid &&
-		    euid != tsk->cred->uid &&
-		    euid != tsk->cred->suid) {
-			put_task_struct(tsk);
+		tcred = __task_cred(tsk);
+		if (cred->euid &&
+		    cred->euid != tcred->uid &&
+		    cred->euid != tcred->suid) {
+			rcu_read_unlock();
 			return -EACCES;
 		}
+		get_task_struct(tsk);
+		rcu_read_unlock();
 	} else {
 		tsk = current;
 		get_task_struct(tsk);
diff --git a/kernel/exit.c b/kernel/exit.c
index e0f6e1892fb..bbc22530f2c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -160,7 +160,10 @@ void release_task(struct task_struct * p)
 	int zap_leader;
 repeat:
 	tracehook_prepare_release_task(p);
-	atomic_dec(&p->cred->user->processes);
+	/* don't need to get the RCU readlock here - the process is dead and
+	 * can't be modifying its own credentials */
+	atomic_dec(&__task_cred(p)->user->processes);
+
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
@@ -1267,12 +1270,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
+	uid_t uid = __task_cred(p)->uid;
 
 	if (!likely(options & WEXITED))
 		return 0;
 
 	if (unlikely(options & WNOWAIT)) {
-		uid_t uid = p->cred->uid;
 		int exit_code = p->exit_code;
 		int why, status;
 
@@ -1393,7 +1396,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	if (!retval && infop)
 		retval = put_user(pid, &infop->si_pid);
 	if (!retval && infop)
-		retval = put_user(p->cred->uid, &infop->si_uid);
+		retval = put_user(uid, &infop->si_uid);
 	if (!retval)
 		retval = pid;
 
@@ -1458,7 +1461,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
-	uid = p->cred->uid;
+	/* don't need the RCU readlock here as we're holding a spinlock */
+	uid = __task_cred(p)->uid;
 unlock_sig:
 	spin_unlock_irq(&p->sighand->siglock);
 	if (!exit_code)
@@ -1532,10 +1536,10 @@ static int wait_task_continued(struct task_struct *p, int options,
 	}
 	if (!unlikely(options & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
+	uid = __task_cred(p)->uid;
 	spin_unlock_irq(&p->sighand->siglock);
 
 	pid = task_pid_vnr(p);
-	uid = p->cred->uid;
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 28421d8210b..4fe790e89d0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -439,15 +439,20 @@ static void free_pi_state(struct futex_pi_state *pi_state)
 static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred;
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
-	if (!p || (euid != p->cred->euid &&
-		   euid != p->cred->uid))
+	if (!p) {
 		p = ERR_PTR(-ESRCH);
-	else
-		get_task_struct(p);
+	} else {
+		pcred = __task_cred(p);
+		if (cred->euid != pcred->euid &&
+		    cred->euid != pcred->uid)
+			p = ERR_PTR(-ESRCH);
+		else
+			get_task_struct(p);
+	}
 
 	rcu_read_unlock();
 
@@ -1831,7 +1836,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -1847,8 +1852,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->cred->euid &&
-		    euid != p->cred->uid &&
+		pcred = __task_cred(p);
+		if (cred->euid != pcred->euid &&
+		    cred->euid != pcred->uid &&
 		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2c3fd5ed34f..d607a5b9ee2 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,7 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 {
 	struct compat_robust_list_head __user *head;
 	unsigned long ret;
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -151,8 +151,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 		if (!p)
 			goto err_unlock;
 		ret = -EPERM;
-		if (euid != p->cred->euid &&
-		    euid != p->cred->uid &&
+		pcred = __task_cred(p);
+		if (cred->euid != pcred->euid &&
+		    cred->euid != pcred->uid &&
 		    !capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->compat_robust_list;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 49849d12dd1..b9d5f4e4f6a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -115,7 +115,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 
 int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
-	struct cred *cred = current->cred, *tcred = task->cred;
+	const struct cred *cred = current_cred(), *tcred;
 
 	/* May we inspect the given task?
 	 * This check is used both for attaching with ptrace
@@ -125,19 +125,23 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * because setting up the necessary parent/child relationship
 	 * or halting the specified task is impossible.
 	 */
-	uid_t uid = cred->uid;
-	gid_t gid = cred->gid;
 	int dumpable = 0;
 	/* Don't let security modules deny introspection */
 	if (task == current)
 		return 0;
-	if ((uid != tcred->euid ||
-	     uid != tcred->suid ||
-	     uid != tcred->uid  ||
-	     gid != tcred->egid ||
-	     gid != tcred->sgid ||
-	     gid != tcred->gid) && !capable(CAP_SYS_PTRACE))
+	rcu_read_lock();
+	tcred = __task_cred(task);
+	if ((cred->uid != tcred->euid ||
+	     cred->uid != tcred->suid ||
+	     cred->uid != tcred->uid  ||
+	     cred->gid != tcred->egid ||
+	     cred->gid != tcred->sgid ||
+	     cred->gid != tcred->gid) &&
+	    !capable(CAP_SYS_PTRACE)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 	smp_rmb();
 	if (task->mm)
 		dumpable = get_dumpable(task->mm);
diff --git a/kernel/sched.c b/kernel/sched.c
index 733c59e645a..92992e287b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -345,7 +345,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 
 #ifdef CONFIG_USER_SCHED
-	tg = p->cred->user->tg;
+	rcu_read_lock();
+	tg = __task_cred(p)->user->tg;
+	rcu_read_unlock();
 #elif defined(CONFIG_CGROUP_SCHED)
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 				struct task_group, css);
@@ -5121,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	match = (cred->euid == pcred->euid ||
+		 cred->euid == pcred->uid);
+	rcu_read_unlock();
+	return match;
+}
+
 static int __sched_setscheduler(struct task_struct *p, int policy,
 				struct sched_param *param, bool user)
 {
@@ -5128,7 +5146,6 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	unsigned long flags;
 	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
-	uid_t euid;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
@@ -5181,9 +5198,7 @@ recheck:
 			return -EPERM;
 
 		/* can't change other user's priorities */
-		euid = current_euid();
-		if (euid != p->cred->euid &&
-		    euid != p->cred->uid)
+		if (!check_same_owner(p))
 			return -EPERM;
 	}
 
@@ -5394,7 +5409,6 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	cpumask_t cpus_allowed;
 	cpumask_t new_mask = *in_mask;
 	struct task_struct *p;
-	uid_t euid;
 	int retval;
 
 	get_online_cpus();
@@ -5415,11 +5429,8 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 
-	euid = current_euid();
 	retval = -EPERM;
-	if (euid != p->cred->euid &&
-	    euid != p->cred->uid &&
-	    !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p, 0, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index 80e8a6489f9..84989124baf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -177,6 +177,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 	return sig;
 }
 
+/*
+ * allocate a new signal queue record
+ * - this may be called without locks if and only if t == current, otherwise an
+ *   appopriate lock must be held to protect t's user_struct
+ */
 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
@@ -184,11 +189,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	struct user_struct *user;
 
 	/*
-	 * In order to avoid problems with "switch_user()", we want to make
-	 * sure that the compiler doesn't re-load "t->user"
+	 * We won't get problems with the target's UID changing under us
+	 * because changing it requires RCU be used, and if t != current, the
+	 * caller must be holding the RCU readlock (by way of a spinlock) and
+	 * we use RCU protection here
 	 */
-	user = t->cred->user;
-	barrier();
+	user = __task_cred(t)->user;
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
@@ -562,12 +568,13 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
 
 /*
  * Bad permissions for sending the signal
+ * - the caller must hold at least the RCU read lock
  */
 static int check_kill_permission(int sig, struct siginfo *info,
 				 struct task_struct *t)
 {
+	const struct cred *cred = current_cred(), *tcred;
 	struct pid *sid;
-	uid_t uid, euid;
 	int error;
 
 	if (!valid_signal(sig))
@@ -580,10 +587,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (error)
 		return error;
 
-	uid = current_uid();
-	euid = current_euid();
-	if ((euid ^ t->cred->suid) && (euid ^ t->cred->uid) &&
-	    (uid  ^ t->cred->suid) && (uid  ^ t->cred->uid) &&
+	tcred = __task_cred(t);
+	if ((cred->euid ^ tcred->suid) &&
+	    (cred->euid ^ tcred->uid) &&
+	    (cred->uid  ^ tcred->suid) &&
+	    (cred->uid  ^ tcred->uid) &&
 	    !capable(CAP_KILL)) {
 		switch (sig) {
 		case SIGCONT:
@@ -1011,6 +1019,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
 	return sighand;
 }
 
+/*
+ * send signal info to all the members of a group
+ * - the caller must hold the RCU read lock at least
+ */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
@@ -1032,8 +1044,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 /*
  * __kill_pgrp_info() sends a signal to a process group: this is what the tty
  * control characters do (^C, ^Z etc)
+ * - the caller must hold at least a readlock on tasklist_lock
  */
-
 int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
 {
 	struct task_struct *p = NULL;
@@ -1089,6 +1101,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 {
 	int ret = -EINVAL;
 	struct task_struct *p;
+	const struct cred *pcred;
 
 	if (!valid_signal(sig))
 		return ret;
@@ -1099,9 +1112,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		ret = -ESRCH;
 		goto out_unlock;
 	}
-	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
-	    && (euid != p->cred->suid) && (euid != p->cred->uid)
-	    && (uid != p->cred->suid) && (uid != p->cred->uid)) {
+	pcred = __task_cred(p);
+	if ((info == SEND_SIG_NOINFO ||
+	     (!is_si_special(info) && SI_FROMUSER(info))) &&
+	    euid != pcred->suid && euid != pcred->uid &&
+	    uid  != pcred->suid && uid  != pcred->uid) {
 		ret = -EPERM;
 		goto out_unlock;
 	}
@@ -1372,10 +1387,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	info.si_uid = tsk->cred->uid;
-
 	thread_group_cputime(tsk, &cputime);
 	info.si_utime = cputime_to_jiffies(cputime.utime);
 	info.si_stime = cputime_to_jiffies(cputime.stime);
@@ -1443,10 +1457,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 	 */
 	rcu_read_lock();
 	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	info.si_uid = tsk->cred->uid;
-
 	info.si_utime = cputime_to_clock_t(tsk->utime);
 	info.si_stime = cputime_to_clock_t(tsk->stime);
 
@@ -1713,7 +1726,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
 		info->si_errno = 0;
 		info->si_code = SI_USER;
 		info->si_pid = task_pid_vnr(current->parent);
-		info->si_uid = current->parent->cred->uid;
+		info->si_uid = task_uid(current->parent);
 	}
 
 	/* If the (new) signal is now blocked, requeue it.  */
diff --git a/kernel/sys.c b/kernel/sys.c
index c4d6b59553e..ccc9eb736d3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -112,14 +112,17 @@ EXPORT_SYMBOL(cad_pid);
 
 void (*pm_power_off_prepare)(void);
 
+/*
+ * set the priority of a task
+ * - the caller must hold the RCU read lock
+ */
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
-	uid_t euid = current_euid();
+	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
 	int no_nice;
 
-	if (p->cred->uid  != euid &&
-	    p->cred->euid != euid &&
-	    !capable(CAP_SYS_NICE)) {
+	if (pcred->uid  != cred->euid &&
+	    pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
 		error = -EPERM;
 		goto out;
 	}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 6d1ed07bf31..2dc06ab3571 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -27,6 +27,7 @@
  */
 void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 {
+	const struct cred *tcred;
 	struct timespec uptime, ts;
 	u64 ac_etime;
 
@@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 		stats->ac_flag |= AXSIG;
 	stats->ac_nice	 = task_nice(tsk);
 	stats->ac_sched	 = tsk->policy;
-	stats->ac_uid	 = tsk->cred->uid;
-	stats->ac_gid	 = tsk->cred->gid;
 	stats->ac_pid	 = tsk->pid;
 	rcu_read_lock();
+	tcred = __task_cred(tsk);
+	stats->ac_uid	 = tcred->uid;
+	stats->ac_gid	 = tcred->gid;
 	stats->ac_ppid	 = pid_alive(tsk) ?
 				rcu_dereference(tsk->real_parent)->tgid : 0;
 	rcu_read_unlock();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b23492ee3e5..7555219c535 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1110,7 +1110,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 		const unsigned long __user *old_nodes,
 		const unsigned long __user *new_nodes)
 {
-	struct cred *cred, *tcred;
+	const struct cred *cred = current_cred(), *tcred;
 	struct mm_struct *mm;
 	struct task_struct *task;
 	nodemask_t old;
@@ -1145,14 +1145,16 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 * capabilities, superuser privileges or the same
 	 * userid as the target process.
 	 */
-	cred = current->cred;
-	tcred = task->cred;
+	rcu_read_lock();
+	tcred = __task_cred(task);
 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
 	    !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
 		err = -EPERM;
 		goto out;
 	}
+	rcu_read_unlock();
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
diff --git a/mm/migrate.c b/mm/migrate.c
index 794443da1b4..142284229ce 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1045,7 +1045,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 			const int __user *nodes,
 			int __user *status, int flags)
 {
-	struct cred *cred, *tcred;
+	const struct cred *cred = current_cred(), *tcred;
 	struct task_struct *task;
 	struct mm_struct *mm;
 	int err;
@@ -1076,14 +1076,16 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 	 * capabilities, superuser privileges or the same
 	 * userid as the target process.
 	 */
-	cred = current->cred;
-	tcred = task->cred;
+	rcu_read_lock();
+	tcred = __task_cred(task);
 	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
 	    cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
 	    !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
 		err = -EPERM;
 		goto out;
 	}
+	rcu_read_unlock();
 
  	err = security_task_movememory(task);
  	if (err)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3af787ba207..0e0b282a207 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -298,9 +298,9 @@ static void dump_tasks(const struct mem_cgroup *mem)
 
 		task_lock(p);
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
-		       p->pid, p->cred->uid, p->tgid, p->mm->total_vm,
-		       get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj,
-		       p->comm);
+		       p->pid, __task_cred(p)->uid, p->tgid,
+		       p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p),
+		       p->oomkilladj, p->comm);
 		task_unlock(p);
 	} while_each_thread(g, p);
 }
diff --git a/security/commoncap.c b/security/commoncap.c
index 61307f59000..0384bf95db6 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -51,10 +51,13 @@ EXPORT_SYMBOL(cap_netlink_recv);
  */
 int cap_capable(struct task_struct *tsk, int cap, int audit)
 {
+	__u32 cap_raised;
+
 	/* Derived from include/linux/sched.h:capable. */
-	if (cap_raised(tsk->cred->cap_effective, cap))
-		return 0;
-	return -EPERM;
+	rcu_read_lock();
+	cap_raised = cap_raised(__task_cred(tsk)->cap_effective, cap);
+	rcu_read_unlock();
+	return cap_raised ? 0 : -EPERM;
 }
 
 int cap_settime(struct timespec *ts, struct timezone *tz)
@@ -66,34 +69,42 @@ int cap_settime(struct timespec *ts, struct timezone *tz)
 
 int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
 {
-	/* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */
-	if (cap_issubset(child->cred->cap_permitted,
-			 current->cred->cap_permitted))
-		return 0;
-	if (capable(CAP_SYS_PTRACE))
-		return 0;
-	return -EPERM;
+	int ret = 0;
+
+	rcu_read_lock();
+	if (!cap_issubset(child->cred->cap_permitted,
+			  current->cred->cap_permitted) &&
+	    !capable(CAP_SYS_PTRACE))
+		ret = -EPERM;
+	rcu_read_unlock();
+	return ret;
 }
 
 int cap_ptrace_traceme(struct task_struct *parent)
 {
-	if (cap_issubset(current->cred->cap_permitted,
-			 parent->cred->cap_permitted))
-		return 0;
-	if (has_capability(parent, CAP_SYS_PTRACE))
-		return 0;
-	return -EPERM;
+	int ret = 0;
+
+	rcu_read_lock();
+	if (!cap_issubset(current->cred->cap_permitted,
+			 parent->cred->cap_permitted) &&
+	    !has_capability(parent, CAP_SYS_PTRACE))
+		ret = -EPERM;
+	rcu_read_unlock();
+	return ret;
 }
 
 int cap_capget (struct task_struct *target, kernel_cap_t *effective,
 		kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
-	struct cred *cred = target->cred;
+	const struct cred *cred;
 
 	/* Derived from kernel/capability.c:sys_capget. */
+	rcu_read_lock();
+	cred = __task_cred(target);
 	*effective   = cred->cap_effective;
 	*inheritable = cred->cap_inheritable;
 	*permitted   = cred->cap_permitted;
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -433,7 +444,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 
 int cap_bprm_secureexec (struct linux_binprm *bprm)
 {
-	const struct cred *cred = current->cred;
+	const struct cred *cred = current_cred();
 
 	if (cred->uid != 0) {
 		if (bprm->cap_effective)
@@ -511,11 +522,11 @@ static inline void cap_emulate_setxuid (int old_ruid, int old_euid,
 	if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
 	    (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) &&
 	    !issecure(SECURE_KEEP_CAPS)) {
-		cap_clear (cred->cap_permitted);
-		cap_clear (cred->cap_effective);
+		cap_clear(cred->cap_permitted);
+		cap_clear(cred->cap_effective);
 	}
 	if (old_euid == 0 && cred->euid != 0) {
-		cap_clear (cred->cap_effective);
+		cap_clear(cred->cap_effective);
 	}
 	if (old_euid != 0 && cred->euid == 0) {
 		cred->cap_effective = cred->cap_permitted;
@@ -582,9 +593,14 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
  */
 static int cap_safe_nice(struct task_struct *p)
 {
-	if (!cap_issubset(p->cred->cap_permitted,
-			  current->cred->cap_permitted) &&
-	    !capable(CAP_SYS_NICE))
+	int is_subset;
+
+	rcu_read_lock();
+	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
+				 current_cred()->cap_permitted);
+	rcu_read_unlock();
+
+	if (!is_subset && !capable(CAP_SYS_NICE))
 		return -EPERM;
 	return 0;
 }
diff --git a/security/keys/permission.c b/security/keys/permission.c
index baf3d5f31e7..13c36164f28 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -22,13 +22,16 @@ int key_task_permission(const key_ref_t key_ref,
 			struct task_struct *context,
 			key_perm_t perm)
 {
-	struct cred *cred = context->cred;
+	const struct cred *cred;
 	struct key *key;
 	key_perm_t kperm;
 	int ret;
 
 	key = key_ref_to_ptr(key_ref);
 
+	rcu_read_lock();
+	cred = __task_cred(context);
+
 	/* use the second 8-bits of permissions for keys the caller owns */
 	if (key->uid == cred->fsuid) {
 		kperm = key->perm >> 16;
@@ -43,10 +46,7 @@ int key_task_permission(const key_ref_t key_ref,
 			goto use_these_perms;
 		}
 
-		spin_lock(&cred->lock);
 		ret = groups_search(cred->group_info, key->gid);
-		spin_unlock(&cred->lock);
-
 		if (ret) {
 			kperm = key->perm >> 8;
 			goto use_these_perms;
@@ -57,6 +57,8 @@ int key_task_permission(const key_ref_t key_ref,
 	kperm = key->perm;
 
 use_these_perms:
+	rcu_read_lock();
+
 	/* use the top 8-bits of permissions for keys the caller possesses
 	 * - possessor permissions are additive with other permissions
 	 */
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index ce8ac6073d5..212601ebaa4 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -412,10 +412,13 @@ key_ref_t search_process_keyrings(struct key_type *type,
 				  struct task_struct *context)
 {
 	struct request_key_auth *rka;
+	struct cred *cred;
 	key_ref_t key_ref, ret, err;
 
 	might_sleep();
 
+	cred = get_task_cred(context);
+
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
 	 * otherwise we want to return a sample error (probably -EACCES) if
@@ -428,9 +431,9 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	err = ERR_PTR(-EAGAIN);
 
 	/* search the thread keyring first */
-	if (context->cred->thread_keyring) {
+	if (cred->thread_keyring) {
 		key_ref = keyring_search_aux(
-			make_key_ref(context->cred->thread_keyring, 1),
+			make_key_ref(cred->thread_keyring, 1),
 			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
@@ -495,9 +498,9 @@ key_ref_t search_process_keyrings(struct key_type *type,
 		}
 	}
 	/* or search the user-session keyring */
-	else if (context->cred->user->session_keyring) {
+	else if (cred->user->session_keyring) {
 		key_ref = keyring_search_aux(
-			make_key_ref(context->cred->user->session_keyring, 1),
+			make_key_ref(cred->user->session_keyring, 1),
 			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
@@ -519,20 +522,20 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	 * search the keyrings of the process mentioned there
 	 * - we don't permit access to request_key auth keys via this method
 	 */
-	if (context->cred->request_key_auth &&
+	if (cred->request_key_auth &&
 	    context == current &&
 	    type != &key_type_request_key_auth
 	    ) {
 		/* defend against the auth key being revoked */
-		down_read(&context->cred->request_key_auth->sem);
+		down_read(&cred->request_key_auth->sem);
 
-		if (key_validate(context->cred->request_key_auth) == 0) {
-			rka = context->cred->request_key_auth->payload.data;
+		if (key_validate(cred->request_key_auth) == 0) {
+			rka = cred->request_key_auth->payload.data;
 
 			key_ref = search_process_keyrings(type, description,
 							  match, rka->context);
 
-			up_read(&context->cred->request_key_auth->sem);
+			up_read(&cred->request_key_auth->sem);
 
 			if (!IS_ERR(key_ref))
 				goto found;
@@ -549,7 +552,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 				break;
 			}
 		} else {
-			up_read(&context->cred->request_key_auth->sem);
+			up_read(&cred->request_key_auth->sem);
 		}
 	}
 
@@ -557,6 +560,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	key_ref = ret ? ret : err;
 
 found:
+	put_cred(cred);
 	return key_ref;
 
 } /* end search_process_keyrings() */
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 10715d1330b..c8630363823 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -95,13 +95,18 @@ extern void selnl_notify_setenforce(int val);
 static int task_has_security(struct task_struct *tsk,
 			     u32 perms)
 {
-	struct task_security_struct *tsec;
-
-	tsec = tsk->cred->security;
+	const struct task_security_struct *tsec;
+	u32 sid = 0;
+
+	rcu_read_lock();
+	tsec = __task_cred(tsk)->security;
+	if (tsec)
+		sid = tsec->sid;
+	rcu_read_unlock();
 	if (!tsec)
 		return -EACCES;
 
-	return avc_has_perm(tsec->sid, SECINITSID_SECURITY,
+	return avc_has_perm(sid, SECINITSID_SECURITY,
 			    SECCLASS_SECURITY, perms, NULL);
 }
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index e8a4fcb1ad0..11167fd567b 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -30,6 +30,8 @@
 
 #include "smack.h"
 
+#define task_security(task)	(task_cred_xxx((task), security))
+
 /*
  * I hope these are the hokeyist lines of code in the module. Casey.
  */
@@ -1012,7 +1014,7 @@ static void smack_cred_free(struct cred *cred)
  */
 static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
 {
-	return smk_curacc(p->cred->security, MAY_WRITE);
+	return smk_curacc(task_security(p), MAY_WRITE);
 }
 
 /**
@@ -1023,7 +1025,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
  */
 static int smack_task_getpgid(struct task_struct *p)
 {
-	return smk_curacc(p->cred->security, MAY_READ);
+	return smk_curacc(task_security(p), MAY_READ);
 }
 
 /**
@@ -1034,7 +1036,7 @@ static int smack_task_getpgid(struct task_struct *p)
  */
 static int smack_task_getsid(struct task_struct *p)
 {
-	return smk_curacc(p->cred->security, MAY_READ);
+	return smk_curacc(task_security(p), MAY_READ);
 }
 
 /**
@@ -1046,7 +1048,7 @@ static int smack_task_getsid(struct task_struct *p)
  */
 static void smack_task_getsecid(struct task_struct *p, u32 *secid)
 {
-	*secid = smack_to_secid(p->cred->security);
+	*secid = smack_to_secid(task_security(p));
 }
 
 /**
@@ -1062,7 +1064,7 @@ static int smack_task_setnice(struct task_struct *p, int nice)
 
 	rc = cap_task_setnice(p, nice);
 	if (rc == 0)
-		rc = smk_curacc(p->cred->security, MAY_WRITE);
+		rc = smk_curacc(task_security(p), MAY_WRITE);
 	return rc;
 }
 
@@ -1079,7 +1081,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
 
 	rc = cap_task_setioprio(p, ioprio);
 	if (rc == 0)
-		rc = smk_curacc(p->cred->security, MAY_WRITE);
+		rc = smk_curacc(task_security(p), MAY_WRITE);
 	return rc;
 }
 
@@ -1091,7 +1093,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
  */
 static int smack_task_getioprio(struct task_struct *p)
 {
-	return smk_curacc(p->cred->security, MAY_READ);
+	return smk_curacc(task_security(p), MAY_READ);
 }
 
 /**
@@ -1109,7 +1111,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
 
 	rc = cap_task_setscheduler(p, policy, lp);
 	if (rc == 0)
-		rc = smk_curacc(p->cred->security, MAY_WRITE);
+		rc = smk_curacc(task_security(p), MAY_WRITE);
 	return rc;
 }
 
@@ -1121,7 +1123,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
  */
 static int smack_task_getscheduler(struct task_struct *p)
 {
-	return smk_curacc(p->cred->security, MAY_READ);
+	return smk_curacc(task_security(p), MAY_READ);
 }
 
 /**
@@ -1132,7 +1134,7 @@ static int smack_task_getscheduler(struct task_struct *p)
  */
 static int smack_task_movememory(struct task_struct *p)
 {
-	return smk_curacc(p->cred->security, MAY_WRITE);
+	return smk_curacc(task_security(p), MAY_WRITE);
 }
 
 /**
@@ -1155,13 +1157,13 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 	 * can write the receiver.
 	 */
 	if (secid == 0)
-		return smk_curacc(p->cred->security, MAY_WRITE);
+		return smk_curacc(task_security(p), MAY_WRITE);
 	/*
 	 * If the secid isn't 0 we're dealing with some USB IO
 	 * specific behavior. This is not clean. For one thing
 	 * we can't take privilege into account.
 	 */
-	return smk_access(smack_from_secid(secid), p->cred->security, MAY_WRITE);
+	return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE);
 }
 
 /**
@@ -1174,7 +1176,7 @@ static int smack_task_wait(struct task_struct *p)
 {
 	int rc;
 
-	rc = smk_access(current->cred->security, p->cred->security, MAY_WRITE);
+	rc = smk_access(current_security(), task_security(p), MAY_WRITE);
 	if (rc == 0)
 		return 0;
 
@@ -1205,7 +1207,7 @@ static int smack_task_wait(struct task_struct *p)
 static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
 {
 	struct inode_smack *isp = inode->i_security;
-	isp->smk_inode = p->cred->security;
+	isp->smk_inode = task_security(p);
 }
 
 /*
@@ -2010,7 +2012,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
 	if (strcmp(name, "current") != 0)
 		return -EINVAL;
 
-	cp = kstrdup(p->cred->security, GFP_KERNEL);
+	cp = kstrdup(task_security(p), GFP_KERNEL);
 	if (cp == NULL)
 		return -ENOMEM;
 
-- 
cgit v1.2.3-70-g09d2


From bb952bb98a7e479262c7eb25d5592545a3af147d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:20 +1100
Subject: CRED: Separate per-task-group keyrings from signal_struct

Separate per-task-group keyrings from signal_struct and dangle their anchor
from the cred struct rather than the signal_struct.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/cred.h         |  16 +++++++
 include/linux/key.h          |   8 +---
 include/linux/sched.h        |   6 ---
 kernel/cred.c                |  63 +++++++++++++++++++++++++++
 kernel/fork.c                |   7 ---
 security/keys/process_keys.c | 100 +++++++++++++++++--------------------------
 security/keys/request_key.c  |  34 ++++++---------
 7 files changed, 135 insertions(+), 99 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 166ce4ddba6..62b9e532422 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -71,6 +71,21 @@ extern int groups_search(const struct group_info *, gid_t);
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);
 
+/*
+ * The common credentials for a thread group
+ * - shared by CLONE_THREAD
+ */
+#ifdef CONFIG_KEYS
+struct thread_group_cred {
+	atomic_t	usage;
+	pid_t		tgid;			/* thread group process ID */
+	spinlock_t	lock;
+	struct key	*session_keyring;	/* keyring inherited over fork */
+	struct key	*process_keyring;	/* keyring private to this process */
+	struct rcu_head	rcu;			/* RCU deletion hook */
+};
+#endif
+
 /*
  * The security context of a task
  *
@@ -114,6 +129,7 @@ struct cred {
 					 * keys to */
 	struct key	*thread_keyring; /* keyring private to this thread */
 	struct key	*request_key_auth; /* assumed request_key authority */
+	struct thread_group_cred *tgcred; /* thread-group shared credentials */
 #endif
 #ifdef CONFIG_SECURITY
 	void		*security;	/* subjective LSM security */
diff --git a/include/linux/key.h b/include/linux/key.h
index df709e1af3c..0836cc838b0 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -278,9 +278,7 @@ extern ctl_table key_sysctls[];
  */
 extern void switch_uid_keyring(struct user_struct *new_user);
 extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk);
-extern int copy_thread_group_keys(struct task_struct *tsk);
 extern void exit_keys(struct task_struct *tsk);
-extern void exit_thread_group_keys(struct signal_struct *tg);
 extern int suid_keys(struct task_struct *tsk);
 extern int exec_keys(struct task_struct *tsk);
 extern void key_fsuid_changed(struct task_struct *tsk);
@@ -289,8 +287,8 @@ extern void key_init(void);
 
 #define __install_session_keyring(keyring)				\
 ({									\
-	struct key *old_session = current->signal->session_keyring;	\
-	current->signal->session_keyring = keyring;			\
+	struct key *old_session = current->cred->tgcred->session_keyring; \
+	current->cred->tgcred->session_keyring = keyring;		\
 	old_session;							\
 })
 
@@ -308,9 +306,7 @@ extern void key_init(void);
 #define switch_uid_keyring(u)		do { } while(0)
 #define __install_session_keyring(k)	({ NULL; })
 #define copy_keys(f,t)			0
-#define copy_thread_group_keys(t)	0
 #define exit_keys(t)			do { } while(0)
-#define exit_thread_group_keys(tg)	do { } while(0)
 #define suid_keys(t)			do { } while(0)
 #define exec_keys(t)			do { } while(0)
 #define key_fsuid_changed(t)		do { } while(0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 740cf946c8c..2913252989b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -571,12 +571,6 @@ struct signal_struct {
 	 */
 	struct rlimit rlim[RLIM_NLIMITS];
 
-	/* keep the process-shared keyrings here so that they do the right
-	 * thing in threads created with CLONE_THREAD */
-#ifdef CONFIG_KEYS
-	struct key *session_keyring;	/* keyring inherited over fork */
-	struct key *process_keyring;	/* keyring private to this process */
-#endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
 	struct pacct_struct pacct;	/* per-process accounting information */
 #endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 833244a7cb0..ac73e361768 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,17 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 
+/*
+ * The common credentials for the initial task's thread group
+ */
+#ifdef CONFIG_KEYS
+static struct thread_group_cred init_tgcred = {
+	.usage	= ATOMIC_INIT(2),
+	.tgid	= 0,
+	.lock	= SPIN_LOCK_UNLOCKED,
+};
+#endif
+
 /*
  * The initial credentials for the initial task
  */
@@ -28,8 +39,41 @@ struct cred init_cred = {
 	.cap_bset		= CAP_INIT_BSET,
 	.user			= INIT_USER,
 	.group_info		= &init_groups,
+#ifdef CONFIG_KEYS
+	.tgcred			= &init_tgcred,
+#endif
 };
 
+/*
+ * Dispose of the shared task group credentials
+ */
+#ifdef CONFIG_KEYS
+static void release_tgcred_rcu(struct rcu_head *rcu)
+{
+	struct thread_group_cred *tgcred =
+		container_of(rcu, struct thread_group_cred, rcu);
+
+	BUG_ON(atomic_read(&tgcred->usage) != 0);
+
+	key_put(tgcred->session_keyring);
+	key_put(tgcred->process_keyring);
+	kfree(tgcred);
+}
+#endif
+
+/*
+ * Release a set of thread group credentials.
+ */
+static void release_tgcred(struct cred *cred)
+{
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = cred->tgcred;
+
+	if (atomic_dec_and_test(&tgcred->usage))
+		call_rcu(&tgcred->rcu, release_tgcred_rcu);
+#endif
+}
+
 /*
  * The RCU callback to actually dispose of a set of credentials
  */
@@ -41,6 +85,7 @@ static void put_cred_rcu(struct rcu_head *rcu)
 
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
+	release_tgcred(cred);
 	put_group_info(cred->group_info);
 	free_uid(cred->user);
 	security_cred_free(cred);
@@ -71,12 +116,30 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	if (!pcred)
 		return -ENOMEM;
 
+#ifdef CONFIG_KEYS
+	if (clone_flags & CLONE_THREAD) {
+		atomic_inc(&pcred->tgcred->usage);
+	} else {
+		pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL);
+		if (!pcred->tgcred) {
+			kfree(pcred);
+			return -ENOMEM;
+		}
+		atomic_set(&pcred->tgcred->usage, 1);
+		spin_lock_init(&pcred->tgcred->lock);
+		pcred->tgcred->process_keyring = NULL;
+		pcred->tgcred->session_keyring =
+			key_get(p->cred->tgcred->session_keyring);
+	}
+#endif
+
 #ifdef CONFIG_SECURITY
 	pcred->security = NULL;
 #endif
 
 	ret = security_cred_alloc(pcred);
 	if (ret < 0) {
+		release_tgcred(pcred);
 		kfree(pcred);
 		return ret;
 	}
diff --git a/kernel/fork.c b/kernel/fork.c
index c932e283ddf..ded1972672a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,12 +802,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	if (!sig)
 		return -ENOMEM;
 
-	ret = copy_thread_group_keys(tsk);
-	if (ret < 0) {
-		kmem_cache_free(signal_cachep, sig);
-		return ret;
-	}
-
 	atomic_set(&sig->count, 1);
 	atomic_set(&sig->live, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
@@ -852,7 +846,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_signal(struct signal_struct *sig)
 {
 	thread_group_cputime_free(sig);
-	exit_thread_group_keys(sig);
 	tty_kref_put(sig->tty);
 	kmem_cache_free(signal_cachep, sig);
 }
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 212601ebaa4..70ee93406f3 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -189,7 +189,7 @@ int install_process_keyring(void)
 
 	might_sleep();
 
-	if (!tsk->signal->process_keyring) {
+	if (!tsk->cred->tgcred->process_keyring) {
 		sprintf(buf, "_pid.%u", tsk->tgid);
 
 		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
@@ -200,12 +200,12 @@ int install_process_keyring(void)
 		}
 
 		/* attach keyring */
-		spin_lock_irq(&tsk->sighand->siglock);
-		if (!tsk->signal->process_keyring) {
-			tsk->signal->process_keyring = keyring;
+		spin_lock_irq(&tsk->cred->tgcred->lock);
+		if (!tsk->cred->tgcred->process_keyring) {
+			tsk->cred->tgcred->process_keyring = keyring;
 			keyring = NULL;
 		}
-		spin_unlock_irq(&tsk->sighand->siglock);
+		spin_unlock_irq(&tsk->cred->tgcred->lock);
 
 		key_put(keyring);
 	}
@@ -235,11 +235,11 @@ static int install_session_keyring(struct key *keyring)
 		sprintf(buf, "_ses.%u", tsk->tgid);
 
 		flags = KEY_ALLOC_QUOTA_OVERRUN;
-		if (tsk->signal->session_keyring)
+		if (tsk->cred->tgcred->session_keyring)
 			flags = KEY_ALLOC_IN_QUOTA;
 
-		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
-					flags, NULL);
+		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid,
+					tsk, flags, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
 	}
@@ -248,10 +248,10 @@ static int install_session_keyring(struct key *keyring)
 	}
 
 	/* install the keyring */
-	spin_lock_irq(&tsk->sighand->siglock);
-	old = tsk->signal->session_keyring;
-	rcu_assign_pointer(tsk->signal->session_keyring, keyring);
-	spin_unlock_irq(&tsk->sighand->siglock);
+	spin_lock_irq(&tsk->cred->tgcred->lock);
+	old = tsk->cred->tgcred->session_keyring;
+	rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring);
+	spin_unlock_irq(&tsk->cred->tgcred->lock);
 
 	/* we're using RCU on the pointer, but there's no point synchronising
 	 * on it if it didn't previously point to anything */
@@ -264,28 +264,6 @@ static int install_session_keyring(struct key *keyring)
 
 } /* end install_session_keyring() */
 
-/*****************************************************************************/
-/*
- * copy the keys in a thread group for fork without CLONE_THREAD
- */
-int copy_thread_group_keys(struct task_struct *tsk)
-{
-	key_check(current->thread_group->session_keyring);
-	key_check(current->thread_group->process_keyring);
-
-	/* no process keyring yet */
-	tsk->signal->process_keyring = NULL;
-
-	/* same session keyring */
-	rcu_read_lock();
-	tsk->signal->session_keyring =
-		key_get(rcu_dereference(current->signal->session_keyring));
-	rcu_read_unlock();
-
-	return 0;
-
-} /* end copy_thread_group_keys() */
-
 /*****************************************************************************/
 /*
  * copy the keys for fork
@@ -305,17 +283,6 @@ int copy_keys(unsigned long clone_flags, struct task_struct *tsk)
 
 } /* end copy_keys() */
 
-/*****************************************************************************/
-/*
- * dispose of thread group keys upon thread group destruction
- */
-void exit_thread_group_keys(struct signal_struct *tg)
-{
-	key_put(tg->session_keyring);
-	key_put(tg->process_keyring);
-
-} /* end exit_thread_group_keys() */
-
 /*****************************************************************************/
 /*
  * dispose of per-thread keys upon thread exit
@@ -344,10 +311,10 @@ int exec_keys(struct task_struct *tsk)
 	key_put(old);
 
 	/* discard the process keyring from a newly exec'd task */
-	spin_lock_irq(&tsk->sighand->siglock);
-	old = tsk->signal->process_keyring;
-	tsk->signal->process_keyring = NULL;
-	spin_unlock_irq(&tsk->sighand->siglock);
+	spin_lock_irq(&tsk->cred->tgcred->lock);
+	old = tsk->cred->tgcred->process_keyring;
+	tsk->cred->tgcred->process_keyring = NULL;
+	spin_unlock_irq(&tsk->cred->tgcred->lock);
 
 	key_put(old);
 
@@ -452,9 +419,9 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	}
 
 	/* search the process keyring second */
-	if (context->signal->process_keyring) {
+	if (cred->tgcred->process_keyring) {
 		key_ref = keyring_search_aux(
-			make_key_ref(context->signal->process_keyring, 1),
+			make_key_ref(cred->tgcred->process_keyring, 1),
 			context, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
@@ -473,11 +440,11 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	}
 
 	/* search the session keyring */
-	if (context->signal->session_keyring) {
+	if (cred->tgcred->session_keyring) {
 		rcu_read_lock();
 		key_ref = keyring_search_aux(
 			make_key_ref(rcu_dereference(
-					     context->signal->session_keyring),
+					     cred->tgcred->session_keyring),
 				     1),
 			context, type, description, match);
 		rcu_read_unlock();
@@ -586,11 +553,13 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 {
 	struct request_key_auth *rka;
 	struct task_struct *t = current;
-	struct cred *cred = current_cred();
+	struct cred *cred;
 	struct key *key;
 	key_ref_t key_ref, skey_ref;
 	int ret;
 
+try_again:
+	cred = get_current_cred();
 	key_ref = ERR_PTR(-ENOKEY);
 
 	switch (id) {
@@ -604,6 +573,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 				key = ERR_PTR(ret);
 				goto error;
 			}
+			goto reget_creds;
 		}
 
 		key = cred->thread_keyring;
@@ -612,7 +582,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 		break;
 
 	case KEY_SPEC_PROCESS_KEYRING:
-		if (!t->signal->process_keyring) {
+		if (!cred->tgcred->process_keyring) {
 			if (!create)
 				goto error;
 
@@ -621,15 +591,16 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 				key = ERR_PTR(ret);
 				goto error;
 			}
+			goto reget_creds;
 		}
 
-		key = t->signal->process_keyring;
+		key = cred->tgcred->process_keyring;
 		atomic_inc(&key->usage);
 		key_ref = make_key_ref(key, 1);
 		break;
 
 	case KEY_SPEC_SESSION_KEYRING:
-		if (!t->signal->session_keyring) {
+		if (!cred->tgcred->session_keyring) {
 			/* always install a session keyring upon access if one
 			 * doesn't exist yet */
 			ret = install_user_keyrings();
@@ -639,10 +610,11 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 				cred->user->session_keyring);
 			if (ret < 0)
 				goto error;
+			goto reget_creds;
 		}
 
 		rcu_read_lock();
-		key = rcu_dereference(t->signal->session_keyring);
+		key = rcu_dereference(cred->tgcred->session_keyring);
 		atomic_inc(&key->usage);
 		rcu_read_unlock();
 		key_ref = make_key_ref(key, 1);
@@ -758,6 +730,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 		goto invalid_key;
 
 error:
+	put_cred(cred);
 	return key_ref;
 
 invalid_key:
@@ -765,6 +738,12 @@ invalid_key:
 	key_ref = ERR_PTR(ret);
 	goto error;
 
+	/* if we attempted to install a keyring, then it may have caused new
+	 * creds to be installed */
+reget_creds:
+	put_cred(cred);
+	goto try_again;
+
 } /* end lookup_user_key() */
 
 /*****************************************************************************/
@@ -777,6 +756,7 @@ invalid_key:
 long join_session_keyring(const char *name)
 {
 	struct task_struct *tsk = current;
+	struct cred *cred = current->cred;
 	struct key *keyring;
 	long ret;
 
@@ -787,7 +767,7 @@ long join_session_keyring(const char *name)
 			goto error;
 
 		rcu_read_lock();
-		ret = rcu_dereference(tsk->signal->session_keyring)->serial;
+		ret = rcu_dereference(cred->tgcred->session_keyring)->serial;
 		rcu_read_unlock();
 		goto error;
 	}
@@ -799,7 +779,7 @@ long join_session_keyring(const char *name)
 	keyring = find_keyring_by_name(name, false);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
-		keyring = keyring_alloc(name, tsk->cred->uid, tsk->cred->gid, tsk,
+		keyring = keyring_alloc(name, cred->uid, cred->gid, tsk,
 					KEY_ALLOC_IN_QUOTA, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 0488b0af5bd..3d12558362d 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -66,7 +66,6 @@ static int call_sbin_request_key(struct key_construction *cons,
 				 const char *op,
 				 void *aux)
 {
-	struct task_struct *tsk = current;
 	const struct cred *cred = current_cred();
 	key_serial_t prkey, sskey;
 	struct key *key = cons->key, *authkey = cons->authkey, *keyring;
@@ -109,18 +108,13 @@ static int call_sbin_request_key(struct key_construction *cons,
 		cred->thread_keyring->serial : 0);
 
 	prkey = 0;
-	if (tsk->signal->process_keyring)
-		prkey = tsk->signal->process_keyring->serial;
+	if (cred->tgcred->process_keyring)
+		prkey = cred->tgcred->process_keyring->serial;
 
-	sprintf(keyring_str[1], "%d", prkey);
-
-	if (tsk->signal->session_keyring) {
-		rcu_read_lock();
-		sskey = rcu_dereference(tsk->signal->session_keyring)->serial;
-		rcu_read_unlock();
-	} else {
+	if (cred->tgcred->session_keyring)
+		sskey = rcu_dereference(cred->tgcred->session_keyring)->serial;
+	else
 		sskey = cred->user->session_keyring->serial;
-	}
 
 	sprintf(keyring_str[2], "%d", sskey);
 
@@ -222,7 +216,7 @@ static int construct_key(struct key *key, const void *callout_info,
 static void construct_get_dest_keyring(struct key **_dest_keyring)
 {
 	struct request_key_auth *rka;
-	struct task_struct *tsk = current;
+	const struct cred *cred = current_cred();
 	struct key *dest_keyring = *_dest_keyring, *authkey;
 
 	kenter("%p", dest_keyring);
@@ -234,11 +228,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 	} else {
 		/* use a default keyring; falling through the cases until we
 		 * find one that we actually have */
-		switch (tsk->cred->jit_keyring) {
+		switch (cred->jit_keyring) {
 		case KEY_REQKEY_DEFL_DEFAULT:
 		case KEY_REQKEY_DEFL_REQUESTOR_KEYRING:
-			if (tsk->cred->request_key_auth) {
-				authkey = tsk->cred->request_key_auth;
+			if (cred->request_key_auth) {
+				authkey = cred->request_key_auth;
 				down_read(&authkey->sem);
 				rka = authkey->payload.data;
 				if (!test_bit(KEY_FLAG_REVOKED,
@@ -251,19 +245,19 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 			}
 
 		case KEY_REQKEY_DEFL_THREAD_KEYRING:
-			dest_keyring = key_get(tsk->cred->thread_keyring);
+			dest_keyring = key_get(cred->thread_keyring);
 			if (dest_keyring)
 				break;
 
 		case KEY_REQKEY_DEFL_PROCESS_KEYRING:
-			dest_keyring = key_get(tsk->signal->process_keyring);
+			dest_keyring = key_get(cred->tgcred->process_keyring);
 			if (dest_keyring)
 				break;
 
 		case KEY_REQKEY_DEFL_SESSION_KEYRING:
 			rcu_read_lock();
 			dest_keyring = key_get(
-				rcu_dereference(tsk->signal->session_keyring));
+				rcu_dereference(cred->tgcred->session_keyring));
 			rcu_read_unlock();
 
 			if (dest_keyring)
@@ -271,11 +265,11 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
 
 		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
 			dest_keyring =
-				key_get(tsk->cred->user->session_keyring);
+				key_get(cred->user->session_keyring);
 			break;
 
 		case KEY_REQKEY_DEFL_USER_KEYRING:
-			dest_keyring = key_get(tsk->cred->user->uid_keyring);
+			dest_keyring = key_get(cred->user->uid_keyring);
 			break;
 
 		case KEY_REQKEY_DEFL_GROUP_KEYRING:
-- 
cgit v1.2.3-70-g09d2


From 745ca2475a6ac596e3d8d37c2759c0fbe2586227 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:22 +1100
Subject: CRED: Pass credentials through dentry_open()

Pass credentials through dentry_open() so that the COW creds patch can have
SELinux's flush_unauthorized_files() pass the appropriate creds back to itself
when it opens its null chardev.

The security_dentry_open() call also now takes a creds pointer, as does the
dentry_open hook in struct security_operations.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 arch/powerpc/platforms/cell/spufs/inode.c |  4 ++--
 arch/um/drivers/mconsole_kern.c           |  3 ++-
 fs/autofs4/dev-ioctl.c                    |  3 ++-
 fs/ecryptfs/ecryptfs_kernel.h             |  3 ++-
 fs/ecryptfs/kthread.c                     |  9 +++++----
 fs/ecryptfs/main.c                        |  3 ++-
 fs/exportfs/expfs.c                       |  4 +++-
 fs/hppfs/hppfs.c                          |  6 ++++--
 fs/nfsctl.c                               |  3 ++-
 fs/nfsd/nfs4recover.c                     |  3 ++-
 fs/nfsd/vfs.c                             |  3 ++-
 fs/open.c                                 | 17 +++++++++++------
 fs/xfs/linux-2.6/xfs_ioctl.c              |  3 ++-
 include/linux/fs.h                        |  4 +++-
 include/linux/security.h                  |  7 ++++---
 ipc/mqueue.c                              | 11 +++++++----
 security/capability.c                     |  2 +-
 security/security.c                       |  4 ++--
 security/selinux/hooks.c                  | 15 +++++++++------
 19 files changed, 67 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index e128ce7f099..6296bfd9cb0 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -323,7 +323,7 @@ static int spufs_context_open(struct dentry *dentry, struct vfsmount *mnt)
 		goto out;
 	}
 
-	filp = dentry_open(dentry, mnt, O_RDONLY);
+	filp = dentry_open(dentry, mnt, O_RDONLY, current_cred());
 	if (IS_ERR(filp)) {
 		put_unused_fd(ret);
 		ret = PTR_ERR(filp);
@@ -562,7 +562,7 @@ static int spufs_gang_open(struct dentry *dentry, struct vfsmount *mnt)
 		goto out;
 	}
 
-	filp = dentry_open(dentry, mnt, O_RDONLY);
+	filp = dentry_open(dentry, mnt, O_RDONLY, current_cred());
 	if (IS_ERR(filp)) {
 		put_unused_fd(ret);
 		ret = PTR_ERR(filp);
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 19d579d74d2..16d3b3789a5 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -159,7 +159,8 @@ void mconsole_proc(struct mc_request *req)
 		goto out_kill;
 	}
 
-	file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY);
+	file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
+			   current_cred());
 	if (IS_ERR(file)) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
 		goto out_kill;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 625abf5422e..ec16255d27d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -307,7 +307,8 @@ static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
 			goto out;
 		}
 
-		filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY);
+		filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
+				   current_cred());
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
 			goto out;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 3504cf9df35..a75026d35d1 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -691,7 +691,8 @@ int ecryptfs_init_kthread(void);
 void ecryptfs_destroy_kthread(void);
 int ecryptfs_privileged_open(struct file **lower_file,
 			     struct dentry *lower_dentry,
-			     struct vfsmount *lower_mnt);
+			     struct vfsmount *lower_mnt,
+			     const struct cred *cred);
 int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c440c6b58b2..c6d7a4d748a 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -73,7 +73,7 @@ static int ecryptfs_threadfn(void *ignored)
 				mntget(req->lower_mnt);
 				(*req->lower_file) = dentry_open(
 					req->lower_dentry, req->lower_mnt,
-					(O_RDWR | O_LARGEFILE));
+					(O_RDWR | O_LARGEFILE), current_cred());
 				req->flags |= ECRYPTFS_REQ_PROCESSED;
 			}
 			wake_up(&req->wait);
@@ -132,7 +132,8 @@ void ecryptfs_destroy_kthread(void)
  */
 int ecryptfs_privileged_open(struct file **lower_file,
 			     struct dentry *lower_dentry,
-			     struct vfsmount *lower_mnt)
+			     struct vfsmount *lower_mnt,
+			     const struct cred *cred)
 {
 	struct ecryptfs_open_req *req;
 	int rc = 0;
@@ -143,7 +144,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
 	dget(lower_dentry);
 	mntget(lower_mnt);
 	(*lower_file) = dentry_open(lower_dentry, lower_mnt,
-				    (O_RDWR | O_LARGEFILE));
+				    (O_RDWR | O_LARGEFILE), cred);
 	if (!IS_ERR(*lower_file))
 		goto out;
 	req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
@@ -184,7 +185,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
 		dget(lower_dentry);
 		mntget(lower_mnt);
 		(*lower_file) = dentry_open(lower_dentry, lower_mnt,
-					    (O_RDONLY | O_LARGEFILE));
+					    (O_RDONLY | O_LARGEFILE), cred);
 		if (IS_ERR(*lower_file)) {
 			rc = PTR_ERR(*req->lower_file);
 			(*lower_file) = NULL;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 64d2ba980df..fd630713c5c 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -115,6 +115,7 @@ void __ecryptfs_printk(const char *fmt, ...)
  */
 int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 {
+	const struct cred *cred = current_cred();
 	struct ecryptfs_inode_info *inode_info =
 		ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
 	int rc = 0;
@@ -127,7 +128,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 
 		lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 		rc = ecryptfs_privileged_open(&inode_info->lower_file,
-						     lower_dentry, lower_mnt);
+					      lower_dentry, lower_mnt, cred);
 		if (rc || IS_ERR(inode_info->lower_file)) {
 			printk(KERN_ERR "Error opening lower persistent file "
 			       "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 80246bad1b7..ec1fb918200 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/sched.h>
 
 #define dprintk(fmt, args...) do{}while(0)
 
@@ -249,6 +250,7 @@ static int filldir_one(void * __buf, const char * name, int len,
 static int get_name(struct vfsmount *mnt, struct dentry *dentry,
 		char *name, struct dentry *child)
 {
+	const struct cred *cred = current_cred();
 	struct inode *dir = dentry->d_inode;
 	int error;
 	struct file *file;
@@ -263,7 +265,7 @@ static int get_name(struct vfsmount *mnt, struct dentry *dentry,
 	/*
 	 * Open the directory ...
 	 */
-	file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY);
+	file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY, cred);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2b3d1828db9..795e2c130ad 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -426,6 +426,7 @@ static int file_mode(int fmode)
 
 static int hppfs_open(struct inode *inode, struct file *file)
 {
+	const struct cred *cred = current_cred();
 	struct hppfs_private *data;
 	struct vfsmount *proc_mnt;
 	struct dentry *proc_dentry;
@@ -446,7 +447,7 @@ static int hppfs_open(struct inode *inode, struct file *file)
 
 	/* XXX This isn't closed anywhere */
 	data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt),
-				      file_mode(file->f_mode));
+				      file_mode(file->f_mode), cred);
 	err = PTR_ERR(data->proc_file);
 	if (IS_ERR(data->proc_file))
 		goto out_free1;
@@ -489,6 +490,7 @@ static int hppfs_open(struct inode *inode, struct file *file)
 
 static int hppfs_dir_open(struct inode *inode, struct file *file)
 {
+	const struct cred *cred = current_cred();
 	struct hppfs_private *data;
 	struct vfsmount *proc_mnt;
 	struct dentry *proc_dentry;
@@ -502,7 +504,7 @@ static int hppfs_dir_open(struct inode *inode, struct file *file)
 	proc_dentry = HPPFS_I(inode)->proc_dentry;
 	proc_mnt = inode->i_sb->s_fs_info;
 	data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt),
-				      file_mode(file->f_mode));
+				      file_mode(file->f_mode), cred);
 	err = PTR_ERR(data->proc_file);
 	if (IS_ERR(data->proc_file))
 		goto out_free;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index aed8145d908..cc4ef2642a5 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -41,7 +41,8 @@ static struct file *do_open(char *name, int flags)
 		error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
 
 	if (!error)
-		return dentry_open(nd.path.dentry, nd.path.mnt, flags);
+		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
+				   current_cred());
 
 	path_put(&nd.path);
 	return ERR_PTR(error);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index a5e14e8695e..632a50b4b37 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -226,7 +226,8 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 
 	nfs4_save_user(&uid, &gid);
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY);
+	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 890d9a68c85..b59ec5a6ed2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -671,6 +671,7 @@ __be32
 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			int access, struct file **filp)
 {
+	const struct cred *cred = current_cred();
 	struct dentry	*dentry;
 	struct inode	*inode;
 	int		flags = O_RDONLY|O_LARGEFILE;
@@ -725,7 +726,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 		DQUOT_INIT(inode);
 	}
 	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
-				flags);
+			    flags, cred);
 	if (IS_ERR(*filp))
 		host_err = PTR_ERR(*filp);
 out_nfserr:
diff --git a/fs/open.c b/fs/open.c
index b1238e195e7..f96eaab280a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -783,7 +783,8 @@ static inline int __get_file_write_access(struct inode *inode,
 
 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int flags, struct file *f,
-					int (*open)(struct inode *, struct file *))
+					int (*open)(struct inode *, struct file *),
+					const struct cred *cred)
 {
 	struct inode *inode;
 	int error;
@@ -807,7 +808,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_op = fops_get(inode->i_fop);
 	file_move(f, &inode->i_sb->s_files);
 
-	error = security_dentry_open(f);
+	error = security_dentry_open(f, cred);
 	if (error)
 		goto cleanup_all;
 
@@ -882,6 +883,8 @@ cleanup_file:
 struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
 		int (*open)(struct inode *, struct file *))
 {
+	const struct cred *cred = current_cred();
+
 	if (IS_ERR(nd->intent.open.file))
 		goto out;
 	if (IS_ERR(dentry))
@@ -889,7 +892,7 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry
 	nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
 					     nd->intent.open.flags - 1,
 					     nd->intent.open.file,
-					     open);
+					     open, cred);
 out:
 	return nd->intent.open.file;
 out_err:
@@ -908,6 +911,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
  */
 struct file *nameidata_to_filp(struct nameidata *nd, int flags)
 {
+	const struct cred *cred = current_cred();
 	struct file *filp;
 
 	/* Pick up the filp from the open intent */
@@ -915,7 +919,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
 	/* Has the filesystem initialised the file for us? */
 	if (filp->f_path.dentry == NULL)
 		filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
-				     NULL);
+				     NULL, cred);
 	else
 		path_put(&nd->path);
 	return filp;
@@ -925,7 +929,8 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
  * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an
  * error.
  */
-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
+			 const struct cred *cred)
 {
 	int error;
 	struct file *f;
@@ -950,7 +955,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 		return ERR_PTR(error);
 	}
 
-	return __dentry_open(dentry, mnt, flags, f, NULL);
+	return __dentry_open(dentry, mnt, flags, f, NULL, cred);
 }
 EXPORT_SYMBOL(dentry_open);
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 67c72aec97e..281cbd5a25c 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -256,6 +256,7 @@ xfs_open_by_handle(
 	struct file		*parfilp,
 	struct inode		*parinode)
 {
+	const struct cred	*cred = current_cred();
 	int			error;
 	int			new_fd;
 	int			permflag;
@@ -321,7 +322,7 @@ xfs_open_by_handle(
 	mntget(parfilp->f_path.mnt);
 
 	/* Create file pointer. */
-	filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags);
+	filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred);
 	if (IS_ERR(filp)) {
 		put_unused_fd(new_fd);
 		return -XFS_ERROR(-PTR_ERR(filp));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3d404aaabe..3bfec1327b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ struct poll_table_struct;
 struct kstatfs;
 struct vm_area_struct;
 struct vfsmount;
+struct cred;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -1673,7 +1674,8 @@ extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
 			int mode);
 extern struct file *filp_open(const char *, int, int);
-extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
+				 const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);
 extern char * getname(const char __user *);
 
diff --git a/include/linux/security.h b/include/linux/security.h
index 9239cc11eb9..7e9fe046a0d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1402,7 +1402,7 @@ struct security_operations {
 	int (*file_send_sigiotask) (struct task_struct *tsk,
 				    struct fown_struct *fown, int sig);
 	int (*file_receive) (struct file *file);
-	int (*dentry_open) (struct file *file);
+	int (*dentry_open) (struct file *file, const struct cred *cred);
 
 	int (*task_create) (unsigned long clone_flags);
 	int (*cred_alloc_security) (struct cred *cred);
@@ -1658,7 +1658,7 @@ int security_file_set_fowner(struct file *file);
 int security_file_send_sigiotask(struct task_struct *tsk,
 				 struct fown_struct *fown, int sig);
 int security_file_receive(struct file *file);
-int security_dentry_open(struct file *file);
+int security_dentry_open(struct file *file, const struct cred *cred);
 int security_task_create(unsigned long clone_flags);
 int security_cred_alloc(struct cred *cred);
 void security_cred_free(struct cred *cred);
@@ -2171,7 +2171,8 @@ static inline int security_file_receive(struct file *file)
 	return 0;
 }
 
-static inline int security_dentry_open(struct file *file)
+static inline int security_dentry_open(struct file *file,
+				       const struct cred *cred)
 {
 	return 0;
 }
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 1151881ccb9..d9393f8e4c3 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -594,6 +594,7 @@ static int mq_attr_ok(struct mq_attr *attr)
 static struct file *do_create(struct dentry *dir, struct dentry *dentry,
 			int oflag, mode_t mode, struct mq_attr __user *u_attr)
 {
+	const struct cred *cred = current_cred();
 	struct mq_attr attr;
 	struct file *result;
 	int ret;
@@ -618,7 +619,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry,
 	if (ret)
 		goto out_drop_write;
 
-	result = dentry_open(dentry, mqueue_mnt, oflag);
+	result = dentry_open(dentry, mqueue_mnt, oflag, cred);
 	/*
 	 * dentry_open() took a persistent mnt_want_write(),
 	 * so we can now drop this one.
@@ -637,8 +638,10 @@ out:
 /* Opens existing queue */
 static struct file *do_open(struct dentry *dentry, int oflag)
 {
-static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
-					MAY_READ | MAY_WRITE };
+	const struct cred *cred = current_cred();
+
+	static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
+						  MAY_READ | MAY_WRITE };
 
 	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) {
 		dput(dentry);
@@ -652,7 +655,7 @@ static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
 		return ERR_PTR(-EACCES);
 	}
 
-	return dentry_open(dentry, mqueue_mnt, oflag);
+	return dentry_open(dentry, mqueue_mnt, oflag, cred);
 }
 
 asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
diff --git a/security/capability.c b/security/capability.c
index 6c4b5137ca7..fac2f61b69a 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -330,7 +330,7 @@ static int cap_file_receive(struct file *file)
 	return 0;
 }
 
-static int cap_dentry_open(struct file *file)
+static int cap_dentry_open(struct file *file, const struct cred *cred)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index d058f7d5b10..f40a0a04c3c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -606,9 +606,9 @@ int security_file_receive(struct file *file)
 	return security_ops->file_receive(file);
 }
 
-int security_dentry_open(struct file *file)
+int security_dentry_open(struct file *file, const struct cred *cred)
 {
-	return security_ops->dentry_open(file);
+	return security_ops->dentry_open(file, cred);
 }
 
 int security_task_create(unsigned long clone_flags)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index cc6e5a3f10c..f20cbd681ba 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2150,9 +2150,9 @@ extern struct vfsmount *selinuxfs_mount;
 extern struct dentry *selinux_null;
 
 /* Derived from fs/exec.c:flush_old_files. */
-static inline void flush_unauthorized_files(struct files_struct *files)
+static inline void flush_unauthorized_files(const struct cred *cred,
+					    struct files_struct *files)
 {
-	const struct cred *cred = current_cred();
 	struct avc_audit_data ad;
 	struct file *file, *devnull = NULL;
 	struct tty_struct *tty;
@@ -2222,7 +2222,10 @@ static inline void flush_unauthorized_files(struct files_struct *files)
 					if (devnull) {
 						get_file(devnull);
 					} else {
-						devnull = dentry_open(dget(selinux_null), mntget(selinuxfs_mount), O_RDWR);
+						devnull = dentry_open(
+							dget(selinux_null),
+							mntget(selinuxfs_mount),
+							O_RDWR, cred);
 						if (IS_ERR(devnull)) {
 							devnull = NULL;
 							put_unused_fd(fd);
@@ -2302,6 +2305,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
  */
 static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
 {
+	const struct cred *cred = current_cred();
 	struct task_security_struct *tsec;
 	struct rlimit *rlim, *initrlim;
 	struct itimerval itimer;
@@ -2321,7 +2325,7 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
 		return;
 
 	/* Close files for which the new task SID is not authorized. */
-	flush_unauthorized_files(current->files);
+	flush_unauthorized_files(cred, current->files);
 
 	/* Check whether the new SID can inherit signal state
 	   from the old SID.  If not, clear itimers to avoid
@@ -3202,9 +3206,8 @@ static int selinux_file_receive(struct file *file)
 	return file_has_perm(cred, file, file_to_av(file));
 }
 
-static int selinux_dentry_open(struct file *file)
+static int selinux_dentry_open(struct file *file, const struct cred *cred)
 {
-	const struct cred *cred = current_cred();
 	struct file_security_struct *fsec;
 	struct inode *inode;
 	struct inode_security_struct *isec;
-- 
cgit v1.2.3-70-g09d2


From d84f4f992cbd76e8f39c488cf0c5d123843923b1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:23 +1100
Subject: CRED: Inaugurate COW credentials

Inaugurate copy-on-write credentials management.  This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.

A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().

With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:

	struct cred *new = prepare_creds();
	int ret = blah(new);
	if (ret < 0) {
		abort_creds(new);
		return ret;
	}
	return commit_creds(new);

There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.

To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const.  The purpose of this is compile-time
discouragement of altering credentials through those pointers.  Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:

  (1) Its reference count may incremented and decremented.

  (2) The keyrings to which it points may be modified, but not replaced.

The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).

This patch and the preceding patches have been tested with the LTP SELinux
testsuite.

This patch makes several logical sets of alteration:

 (1) execve().

     This now prepares and commits credentials in various places in the
     security code rather than altering the current creds directly.

 (2) Temporary credential overrides.

     do_coredump() and sys_faccessat() now prepare their own credentials and
     temporarily override the ones currently on the acting thread, whilst
     preventing interference from other threads by holding cred_replace_mutex
     on the thread being dumped.

     This will be replaced in a future patch by something that hands down the
     credentials directly to the functions being called, rather than altering
     the task's objective credentials.

 (3) LSM interface.

     A number of functions have been changed, added or removed:

     (*) security_capset_check(), ->capset_check()
     (*) security_capset_set(), ->capset_set()

     	 Removed in favour of security_capset().

     (*) security_capset(), ->capset()

     	 New.  This is passed a pointer to the new creds, a pointer to the old
     	 creds and the proposed capability sets.  It should fill in the new
     	 creds or return an error.  All pointers, barring the pointer to the
     	 new creds, are now const.

     (*) security_bprm_apply_creds(), ->bprm_apply_creds()

     	 Changed; now returns a value, which will cause the process to be
     	 killed if it's an error.

     (*) security_task_alloc(), ->task_alloc_security()

     	 Removed in favour of security_prepare_creds().

     (*) security_cred_free(), ->cred_free()

     	 New.  Free security data attached to cred->security.

     (*) security_prepare_creds(), ->cred_prepare()

     	 New. Duplicate any security data attached to cred->security.

     (*) security_commit_creds(), ->cred_commit()

     	 New. Apply any security effects for the upcoming installation of new
     	 security by commit_creds().

     (*) security_task_post_setuid(), ->task_post_setuid()

     	 Removed in favour of security_task_fix_setuid().

     (*) security_task_fix_setuid(), ->task_fix_setuid()

     	 Fix up the proposed new credentials for setuid().  This is used by
     	 cap_set_fix_setuid() to implicitly adjust capabilities in line with
     	 setuid() changes.  Changes are made to the new credentials, rather
     	 than the task itself as in security_task_post_setuid().

     (*) security_task_reparent_to_init(), ->task_reparent_to_init()

     	 Removed.  Instead the task being reparented to init is referred
     	 directly to init's credentials.

	 NOTE!  This results in the loss of some state: SELinux's osid no
	 longer records the sid of the thread that forked it.

     (*) security_key_alloc(), ->key_alloc()
     (*) security_key_permission(), ->key_permission()

     	 Changed.  These now take cred pointers rather than task pointers to
     	 refer to the security context.

 (4) sys_capset().

     This has been simplified and uses less locking.  The LSM functions it
     calls have been merged.

 (5) reparent_to_kthreadd().

     This gives the current thread the same credentials as init by simply using
     commit_thread() to point that way.

 (6) __sigqueue_alloc() and switch_uid()

     __sigqueue_alloc() can't stop the target task from changing its creds
     beneath it, so this function gets a reference to the currently applicable
     user_struct which it then passes into the sigqueue struct it returns if
     successful.

     switch_uid() is now called from commit_creds(), and possibly should be
     folded into that.  commit_creds() should take care of protecting
     __sigqueue_alloc().

 (7) [sg]et[ug]id() and co and [sg]et_current_groups.

     The set functions now all use prepare_creds(), commit_creds() and
     abort_creds() to build and check a new set of credentials before applying
     it.

     security_task_set[ug]id() is called inside the prepared section.  This
     guarantees that nothing else will affect the creds until we've finished.

     The calling of set_dumpable() has been moved into commit_creds().

     Much of the functionality of set_user() has been moved into
     commit_creds().

     The get functions all simply access the data directly.

 (8) security_task_prctl() and cap_task_prctl().

     security_task_prctl() has been modified to return -ENOSYS if it doesn't
     want to handle a function, or otherwise return the return value directly
     rather than through an argument.

     Additionally, cap_task_prctl() now prepares a new set of credentials, even
     if it doesn't end up using it.

 (9) Keyrings.

     A number of changes have been made to the keyrings code:

     (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
     	 all been dropped and built in to the credentials functions directly.
     	 They may want separating out again later.

     (b) key_alloc() and search_process_keyrings() now take a cred pointer
     	 rather than a task pointer to specify the security context.

     (c) copy_creds() gives a new thread within the same thread group a new
     	 thread keyring if its parent had one, otherwise it discards the thread
     	 keyring.

     (d) The authorisation key now points directly to the credentials to extend
     	 the search into rather pointing to the task that carries them.

     (e) Installing thread, process or session keyrings causes a new set of
     	 credentials to be created, even though it's not strictly necessary for
     	 process or session keyrings (they're shared).

(10) Usermode helper.

     The usermode helper code now carries a cred struct pointer in its
     subprocess_info struct instead of a new session keyring pointer.  This set
     of credentials is derived from init_cred and installed on the new process
     after it has been cloned.

     call_usermodehelper_setup() allocates the new credentials and
     call_usermodehelper_freeinfo() discards them if they haven't been used.  A
     special cred function (prepare_usermodeinfo_creds()) is provided
     specifically for call_usermodehelper_setup() to call.

     call_usermodehelper_setkeys() adjusts the credentials to sport the
     supplied keyring as the new session keyring.

(11) SELinux.

     SELinux has a number of changes, in addition to those to support the LSM
     interface changes mentioned above:

     (a) selinux_setprocattr() no longer does its check for whether the
     	 current ptracer can access processes with the new SID inside the lock
     	 that covers getting the ptracer's SID.  Whilst this lock ensures that
     	 the check is done with the ptracer pinned, the result is only valid
     	 until the lock is released, so there's no point doing it inside the
     	 lock.

(12) is_single_threaded().

     This function has been extracted from selinux_setprocattr() and put into
     a file of its own in the lib/ directory as join_session_keyring() now
     wants to use it too.

     The code in SELinux just checked to see whether a task shared mm_structs
     with other tasks (CLONE_VM), but that isn't good enough.  We really want
     to know if they're part of the same thread group (CLONE_THREAD).

(13) nfsd.

     The NFS server daemon now has to use the COW credentials to set the
     credentials it is going to use.  It really needs to pass the credentials
     down to the functions it calls, but it can't do that until other patches
     in this series have been applied.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/exec.c                        |  31 ++-
 fs/nfsd/auth.c                   |  92 ++++----
 fs/nfsd/nfs4recover.c            |  68 +++---
 fs/nfsd/nfsfh.c                  |  11 +-
 fs/open.c                        |  31 ++-
 include/linux/audit.h            |  22 +-
 include/linux/capability.h       |   2 -
 include/linux/cred.h             |  44 +++-
 include/linux/init_task.h        |   2 +
 include/linux/key.h              |  22 +-
 include/linux/sched.h            |   6 +-
 include/linux/security.h         | 178 +++++++---------
 init/main.c                      |   1 +
 kernel/auditsc.c                 |  42 ++--
 kernel/capability.c              |  78 +++----
 kernel/cred-internals.h          |  21 ++
 kernel/cred.c                    | 321 ++++++++++++++++++++++++----
 kernel/exit.c                    |   9 +-
 kernel/fork.c                    |   7 +-
 kernel/kmod.c                    |  30 ++-
 kernel/ptrace.c                  |   9 +
 kernel/signal.c                  |  10 +-
 kernel/sys.c                     | 450 +++++++++++++++++++++------------------
 kernel/user.c                    |  37 +---
 kernel/user_namespace.c          |  12 +-
 lib/Makefile                     |   2 +-
 net/rxrpc/ar-key.c               |   6 +-
 security/capability.c            |  21 +-
 security/commoncap.c             | 265 +++++++++++------------
 security/keys/internal.h         |  17 +-
 security/keys/key.c              |  25 +--
 security/keys/keyctl.c           |  95 ++++++---
 security/keys/keyring.c          |  14 +-
 security/keys/permission.c       |  24 ++-
 security/keys/proc.c             |   8 +-
 security/keys/process_keys.c     | 333 ++++++++++++++---------------
 security/keys/request_key.c      |  29 ++-
 security/keys/request_key_auth.c |  41 ++--
 security/security.c              |  58 +++--
 security/selinux/hooks.c         | 286 ++++++++++++-------------
 security/smack/smack_lsm.c       |  82 ++++---
 41 files changed, 1603 insertions(+), 1239 deletions(-)
 create mode 100644 kernel/cred-internals.h

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index a5330e1a221..9bd3559ddec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1007,13 +1007,12 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 */
 	current->mm->task_size = TASK_SIZE;
 
-	if (bprm->e_uid != current_euid() || bprm->e_gid != current_egid()) {
-		suid_keys(current);
+	if (bprm->e_uid != current_euid() ||
+	    bprm->e_gid != current_egid()) {
 		set_dumpable(current->mm, suid_dumpable);
 		current->pdeath_signal = 0;
 	} else if (file_permission(bprm->file, MAY_READ) ||
 			(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
-		suid_keys(current);
 		set_dumpable(current->mm, suid_dumpable);
 	}
 
@@ -1096,10 +1095,8 @@ void compute_creds(struct linux_binprm *bprm)
 {
 	int unsafe;
 
-	if (bprm->e_uid != current_uid()) {
-		suid_keys(current);
+	if (bprm->e_uid != current_uid())
 		current->pdeath_signal = 0;
-	}
 	exec_keys(current);
 
 	task_lock(current);
@@ -1709,8 +1706,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	struct linux_binfmt * binfmt;
 	struct inode * inode;
 	struct file * file;
+	const struct cred *old_cred;
+	struct cred *cred;
 	int retval = 0;
-	int fsuid = current_fsuid();
 	int flag = 0;
 	int ispipe = 0;
 	unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
@@ -1723,12 +1721,20 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
 		goto fail;
+
+	cred = prepare_creds();
+	if (!cred) {
+		retval = -ENOMEM;
+		goto fail;
+	}
+
 	down_write(&mm->mmap_sem);
 	/*
 	 * If another thread got here first, or we are not dumpable, bail out.
 	 */
 	if (mm->core_state || !get_dumpable(mm)) {
 		up_write(&mm->mmap_sem);
+		put_cred(cred);
 		goto fail;
 	}
 
@@ -1739,12 +1745,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 */
 	if (get_dumpable(mm) == 2) {	/* Setuid core dump mode */
 		flag = O_EXCL;		/* Stop rewrite attacks */
-		current->cred->fsuid = 0;	/* Dump root private */
+		cred->fsuid = 0;	/* Dump root private */
 	}
 
 	retval = coredump_wait(exit_code, &core_state);
-	if (retval < 0)
+	if (retval < 0) {
+		put_cred(cred);
 		goto fail;
+	}
+
+	old_cred = override_creds(cred);
 
 	/*
 	 * Clear any false indication of pending signals that might
@@ -1835,7 +1845,8 @@ fail_unlock:
 	if (helper_argv)
 		argv_free(helper_argv);
 
-	current->cred->fsuid = fsuid;
+	revert_creds(old_cred);
+	put_cred(cred);
 	coredump_finish(mm);
 fail:
 	return retval;
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 808fc03a6fb..836ffa1047d 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -27,55 +27,67 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
 
 int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 {
-	struct cred *act_as = current->cred ;
-	struct svc_cred	cred = rqstp->rq_cred;
+	struct group_info *rqgi;
+	struct group_info *gi;
+	struct cred *new;
 	int i;
 	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->fsuid = rqstp->rq_cred.cr_uid;
+	new->fsgid = rqstp->rq_cred.cr_gid;
+
+	rqgi = rqstp->rq_cred.cr_group_info;
+
 	if (flags & NFSEXP_ALLSQUASH) {
-		cred.cr_uid = exp->ex_anon_uid;
-		cred.cr_gid = exp->ex_anon_gid;
-		cred.cr_group_info = groups_alloc(0);
+		new->fsuid = exp->ex_anon_uid;
+		new->fsgid = exp->ex_anon_gid;
+		gi = groups_alloc(0);
 	} else if (flags & NFSEXP_ROOTSQUASH) {
-		struct group_info *gi;
-		if (!cred.cr_uid)
-			cred.cr_uid = exp->ex_anon_uid;
-		if (!cred.cr_gid)
-			cred.cr_gid = exp->ex_anon_gid;
-		gi = groups_alloc(cred.cr_group_info->ngroups);
-		if (gi)
-			for (i = 0; i < cred.cr_group_info->ngroups; i++) {
-				if (!GROUP_AT(cred.cr_group_info, i))
-					GROUP_AT(gi, i) = exp->ex_anon_gid;
-				else
-					GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i);
-			}
-		cred.cr_group_info = gi;
-	} else
-		get_group_info(cred.cr_group_info);
-
-	if (cred.cr_uid != (uid_t) -1)
-		act_as->fsuid = cred.cr_uid;
-	else
-		act_as->fsuid = exp->ex_anon_uid;
-	if (cred.cr_gid != (gid_t) -1)
-		act_as->fsgid = cred.cr_gid;
-	else
-		act_as->fsgid = exp->ex_anon_gid;
+		if (!new->fsuid)
+			new->fsuid = exp->ex_anon_uid;
+		if (!new->fsgid)
+			new->fsgid = exp->ex_anon_gid;
 
-	if (!cred.cr_group_info)
-		return -ENOMEM;
-	ret = set_groups(act_as, cred.cr_group_info);
-	put_group_info(cred.cr_group_info);
-	if ((cred.cr_uid)) {
-		act_as->cap_effective =
-			cap_drop_nfsd_set(act_as->cap_effective);
+		gi = groups_alloc(rqgi->ngroups);
+		if (!gi)
+			goto oom;
+
+		for (i = 0; i < rqgi->ngroups; i++) {
+			if (!GROUP_AT(rqgi, i))
+				GROUP_AT(gi, i) = exp->ex_anon_gid;
+			else
+				GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
+		}
 	} else {
-		act_as->cap_effective =
-			cap_raise_nfsd_set(act_as->cap_effective,
-					   act_as->cap_permitted);
+		gi = get_group_info(rqgi);
 	}
+
+	if (new->fsuid == (uid_t) -1)
+		new->fsuid = exp->ex_anon_uid;
+	if (new->fsgid == (gid_t) -1)
+		new->fsgid = exp->ex_anon_gid;
+
+	ret = set_groups(new, gi);
+	put_group_info(gi);
+	if (!ret)
+		goto error;
+
+	if (new->uid)
+		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
+	else
+		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
+							new->cap_permitted);
+	return commit_creds(new);
+
+oom:
+	ret = -ENOMEM;
+error:
+	abort_creds(new);
 	return ret;
 }
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 632a50b4b37..9371ea12d7f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -54,20 +54,26 @@
 static struct path rec_dir;
 static int rec_dir_init = 0;
 
-static void
-nfs4_save_user(uid_t *saveuid, gid_t *savegid)
+static int
+nfs4_save_creds(const struct cred **original_creds)
 {
-	*saveuid = current->cred->fsuid;
-	*savegid = current->cred->fsgid;
-	current->cred->fsuid = 0;
-	current->cred->fsgid = 0;
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	new->fsuid = 0;
+	new->fsgid = 0;
+	*original_creds = override_creds(new);
+	put_cred(new);
+	return 0;
 }
 
 static void
-nfs4_reset_user(uid_t saveuid, gid_t savegid)
+nfs4_reset_creds(const struct cred *original)
 {
-	current->cred->fsuid = saveuid;
-	current->cred->fsgid = savegid;
+	revert_creds(original);
 }
 
 static void
@@ -129,10 +135,9 @@ nfsd4_sync_rec_dir(void)
 int
 nfsd4_create_clid_dir(struct nfs4_client *clp)
 {
+	const struct cred *original_cred;
 	char *dname = clp->cl_recdir;
 	struct dentry *dentry;
-	uid_t uid;
-	gid_t gid;
 	int status;
 
 	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
@@ -140,7 +145,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	if (!rec_dir_init || clp->cl_firststate)
 		return 0;
 
-	nfs4_save_user(&uid, &gid);
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return status;
 
 	/* lock the parent */
 	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
@@ -168,7 +175,7 @@ out_unlock:
 		clp->cl_firststate = 1;
 		nfsd4_sync_rec_dir();
 	}
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
 	return status;
 }
@@ -211,20 +218,21 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
 static int
 nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 {
+	const struct cred *original_cred;
 	struct file *filp;
 	struct dentry_list_arg dla = {
 		.parent = dir,
 	};
 	struct list_head *dentries = &dla.dentries;
 	struct dentry_list *child;
-	uid_t uid;
-	gid_t gid;
 	int status;
 
 	if (!rec_dir_init)
 		return 0;
 
-	nfs4_save_user(&uid, &gid);
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		return status;
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
 			   current_cred());
@@ -250,7 +258,7 @@ out:
 		dput(child->dentry);
 		kfree(child);
 	}
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 	return status;
 }
 
@@ -312,8 +320,7 @@ out:
 void
 nfsd4_remove_clid_dir(struct nfs4_client *clp)
 {
-	uid_t uid;
-	gid_t gid;
+	const struct cred *original_cred;
 	int status;
 
 	if (!rec_dir_init || !clp->cl_firststate)
@@ -323,9 +330,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
-	nfs4_save_user(&uid, &gid);
+
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0)
+		goto out;
+
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
 	mnt_drop_write(rec_dir.mnt);
@@ -402,16 +413,21 @@ nfsd4_recdir_load(void) {
 void
 nfsd4_init_recdir(char *rec_dirname)
 {
-	uid_t			uid = 0;
-	gid_t			gid = 0;
-	int 			status;
+	const struct cred *original_cred;
+	int status;
 
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			rec_dirname);
 
 	BUG_ON(rec_dir_init);
 
-	nfs4_save_user(&uid, &gid);
+	status = nfs4_save_creds(&original_cred);
+	if (status < 0) {
+		printk("NFSD: Unable to change credentials to find recovery"
+		       " directory: error %d\n",
+		       status);
+		return;
+	}
 
 	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
 			&rec_dir);
@@ -421,7 +437,7 @@ nfsd4_init_recdir(char *rec_dirname)
 
 	if (!status)
 		rec_dir_init = 1;
-	nfs4_reset_user(uid, gid);
+	nfs4_reset_creds(original_cred);
 }
 
 void
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e67cfaea086..f0da7d9c3a9 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -186,9 +186,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		 * access control settings being in effect, we cannot
 		 * fix that case easily.
 		 */
-		current->cred->cap_effective =
-			cap_raise_nfsd_set(current->cred->cap_effective,
-					   current->cred->cap_permitted);
+		struct cred *new = prepare_creds();
+		if (!new)
+			return nfserrno(-ENOMEM);
+		new->cap_effective =
+			cap_raise_nfsd_set(new->cap_effective,
+					   new->cap_permitted);
+		put_cred(override_creds(new));
+		put_cred(new);
 	} else {
 		error = nfsd_setuser_and_check_port(rqstp, exp);
 		if (error)
diff --git a/fs/open.c b/fs/open.c
index f96eaab280a..c0a426d5766 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -425,30 +425,33 @@ out:
  */
 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old_cred;
+	struct cred *override_cred;
 	struct path path;
 	struct inode *inode;
-	int old_fsuid, old_fsgid;
-	kernel_cap_t uninitialized_var(old_cap);  /* !SECURE_NO_SETUID_FIXUP */
 	int res;
 
 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
 		return -EINVAL;
 
-	old_fsuid = cred->fsuid;
-	old_fsgid = cred->fsgid;
+	override_cred = prepare_creds();
+	if (!override_cred)
+		return -ENOMEM;
 
-	cred->fsuid = cred->uid;
-	cred->fsgid = cred->gid;
+	override_cred->fsuid = override_cred->uid;
+	override_cred->fsgid = override_cred->gid;
 
 	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
 		/* Clear the capabilities if we switch to a non-root user */
-		if (current->cred->uid)
-			old_cap = cap_set_effective(__cap_empty_set);
+		if (override_cred->uid)
+			cap_clear(override_cred->cap_effective);
 		else
-			old_cap = cap_set_effective(cred->cap_permitted);
+			override_cred->cap_effective =
+				override_cred->cap_permitted;
 	}
 
+	old_cred = override_creds(override_cred);
+
 	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
 	if (res)
 		goto out;
@@ -485,12 +488,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 out_path_release:
 	path_put(&path);
 out:
-	cred->fsuid = old_fsuid;
-	cred->fsgid = old_fsgid;
-
-	if (!issecure(SECURE_NO_SETUID_FIXUP))
-		cap_set_effective(old_cap);
-
+	revert_creds(old_cred);
+	put_cred(override_cred);
 	return res;
 }
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 6fbebac7b1b..0b2fcb698a6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -454,8 +454,10 @@ extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_pr
 extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
 extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
 extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
-extern void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE);
-extern int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm);
+extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
+				  const struct cred *new,
+				  const struct cred *old);
+extern int __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old);
 
 static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -522,16 +524,20 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
  *
  * -Eric
  */
-static inline void audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
+				       const struct cred *new,
+				       const struct cred *old)
 {
 	if (unlikely(!audit_dummy_context()))
-		__audit_log_bprm_fcaps(bprm, pP, pE);
+		return __audit_log_bprm_fcaps(bprm, new, old);
+	return 0;
 }
 
-static inline int audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+static inline int audit_log_capset(pid_t pid, const struct cred *new,
+				   const struct cred *old)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_log_capset(pid, eff, inh, perm);
+		return __audit_log_capset(pid, new, old);
 	return 0;
 }
 
@@ -566,8 +572,8 @@ extern int audit_signals;
 #define audit_mq_timedreceive(d,l,p,t) ({ 0; })
 #define audit_mq_notify(d,n) ({ 0; })
 #define audit_mq_getsetattr(d,s) ({ 0; })
-#define audit_log_bprm_fcaps(b, p, e) do { ; } while (0)
-#define audit_log_capset(pid, e, i, p) ({ 0; })
+#define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
+#define audit_log_capset(pid, ncr, ocr) ({ 0; })
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7f26580a5a4..e22f48c2a46 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -519,8 +519,6 @@ extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
-kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
-
 /**
  * has_capability - Determine if a task has a superior capability available
  * @t: The task in question
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 62b9e532422..eaf6fa695a0 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,6 +84,8 @@ struct thread_group_cred {
 	struct key	*process_keyring;	/* keyring private to this process */
 	struct rcu_head	rcu;			/* RCU deletion hook */
 };
+
+extern void release_tgcred(struct cred *cred);
 #endif
 
 /*
@@ -137,11 +139,30 @@ struct cred {
 	struct user_struct *user;	/* real user ID subscription */
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
 	struct rcu_head	rcu;		/* RCU deletion hook */
-	spinlock_t	lock;		/* lock for pointer changes */
 };
 
 extern void __put_cred(struct cred *);
 extern int copy_creds(struct task_struct *, unsigned long);
+extern struct cred *prepare_creds(void);
+extern struct cred *prepare_usermodehelper_creds(void);
+extern int commit_creds(struct cred *);
+extern void abort_creds(struct cred *);
+extern const struct cred *override_creds(const struct cred *) __deprecated;
+extern void revert_creds(const struct cred *) __deprecated;
+extern void __init cred_init(void);
+
+/**
+ * get_new_cred - Get a reference on a new set of credentials
+ * @cred: The new credentials to reference
+ *
+ * Get a reference on the specified set of new credentials.  The caller must
+ * release the reference.
+ */
+static inline struct cred *get_new_cred(struct cred *cred)
+{
+	atomic_inc(&cred->usage);
+	return cred;
+}
 
 /**
  * get_cred - Get a reference on a set of credentials
@@ -150,10 +171,9 @@ extern int copy_creds(struct task_struct *, unsigned long);
  * Get a reference on the specified set of credentials.  The caller must
  * release the reference.
  */
-static inline struct cred *get_cred(struct cred *cred)
+static inline const struct cred *get_cred(const struct cred *cred)
 {
-	atomic_inc(&cred->usage);
-	return cred;
+	return get_new_cred((struct cred *) cred);
 }
 
 /**
@@ -166,6 +186,8 @@ static inline struct cred *get_cred(struct cred *cred)
 static inline void put_cred(const struct cred *_cred)
 {
 	struct cred *cred = (struct cred *) _cred;
+
+	BUG_ON(atomic_read(&(cred)->usage) <= 0);
 	if (atomic_dec_and_test(&(cred)->usage))
 		__put_cred(cred);
 }
@@ -250,13 +272,13 @@ static inline void put_cred(const struct cred *_cred)
 	__groups;					\
 })
 
-#define task_cred_xxx(task, xxx)		\
-({						\
-	__typeof__(task->cred->xxx) ___val;	\
-	rcu_read_lock();			\
-	___val = __task_cred((task))->xxx;	\
-	rcu_read_unlock();			\
-	___val;					\
+#define task_cred_xxx(task, xxx)			\
+({							\
+	__typeof__(((struct cred *)NULL)->xxx) ___val;	\
+	rcu_read_lock();				\
+	___val = __task_cred((task))->xxx;		\
+	rcu_read_unlock();				\
+	___val;						\
 })
 
 #define task_uid(task)		(task_cred_xxx((task), uid))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5e24c54b6df..08c3b24ad9a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -150,6 +150,8 @@ extern struct cred init_cred;
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
 	.cred		= &init_cred,					\
+	.cred_exec_mutex =						\
+		 __MUTEX_INITIALIZER(tsk.cred_exec_mutex),		\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/key.h b/include/linux/key.h
index 0836cc838b0..69ecf0934b0 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -73,6 +73,7 @@ struct key;
 struct seq_file;
 struct user_struct;
 struct signal_struct;
+struct cred;
 
 struct key_type;
 struct key_owner;
@@ -181,7 +182,7 @@ struct key {
 extern struct key *key_alloc(struct key_type *type,
 			     const char *desc,
 			     uid_t uid, gid_t gid,
-			     struct task_struct *ctx,
+			     const struct cred *cred,
 			     key_perm_t perm,
 			     unsigned long flags);
 
@@ -249,7 +250,7 @@ extern int key_unlink(struct key *keyring,
 		      struct key *key);
 
 extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-				 struct task_struct *ctx,
+				 const struct cred *cred,
 				 unsigned long flags,
 				 struct key *dest);
 
@@ -276,22 +277,12 @@ extern ctl_table key_sysctls[];
 /*
  * the userspace interface
  */
-extern void switch_uid_keyring(struct user_struct *new_user);
-extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk);
-extern void exit_keys(struct task_struct *tsk);
-extern int suid_keys(struct task_struct *tsk);
+extern int install_thread_keyring_to_cred(struct cred *cred);
 extern int exec_keys(struct task_struct *tsk);
 extern void key_fsuid_changed(struct task_struct *tsk);
 extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
 
-#define __install_session_keyring(keyring)				\
-({									\
-	struct key *old_session = current->cred->tgcred->session_keyring; \
-	current->cred->tgcred->session_keyring = keyring;		\
-	old_session;							\
-})
-
 #else /* CONFIG_KEYS */
 
 #define key_validate(k)			0
@@ -303,11 +294,6 @@ extern void key_init(void);
 #define make_key_ref(k, p)		NULL
 #define key_ref_to_ptr(k)		NULL
 #define is_key_possessed(k)		0
-#define switch_uid_keyring(u)		do { } while(0)
-#define __install_session_keyring(k)	({ NULL; })
-#define copy_keys(f,t)			0
-#define exit_keys(t)			do { } while(0)
-#define suid_keys(t)			do { } while(0)
 #define exec_keys(t)			do { } while(0)
 #define key_fsuid_changed(t)		do { } while(0)
 #define key_fsgid_changed(t)		do { } while(0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2913252989b..121d655e460 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1145,7 +1145,8 @@ struct task_struct {
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	struct cred *cred;	/* actual/objective task credentials */
+	const struct cred *cred;	/* actual/objective task credentials (COW) */
+	struct mutex cred_exec_mutex;	/* execve vs ptrace cred calculation mutex */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
 				     - access with [gs]et_task_comm (which lock
@@ -1720,7 +1721,6 @@ static inline struct user_struct *get_uid(struct user_struct *u)
 	return u;
 }
 extern void free_uid(struct user_struct *);
-extern void switch_uid(struct user_struct *);
 extern void release_uids(struct user_namespace *ns);
 
 #include <asm/current.h>
@@ -1870,6 +1870,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 #define for_each_process(p) \
 	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
 
+extern bool is_single_threaded(struct task_struct *);
+
 /*
  * Careful: do_each_thread/while_each_thread is a double loop so
  *          'break' will not work as expected - use goto instead.
diff --git a/include/linux/security.h b/include/linux/security.h
index 7e9fe046a0d..68be1125144 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -53,24 +53,21 @@ extern int cap_settime(struct timespec *ts, struct timezone *tz);
 extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
 extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
-extern int cap_capset_check(const kernel_cap_t *effective,
-			    const kernel_cap_t *inheritable,
-			    const kernel_cap_t *permitted);
-extern void cap_capset_set(const kernel_cap_t *effective,
-			   const kernel_cap_t *inheritable,
-			   const kernel_cap_t *permitted);
+extern int cap_capset(struct cred *new, const struct cred *old,
+		      const kernel_cap_t *effective,
+		      const kernel_cap_t *inheritable,
+		      const kernel_cap_t *permitted);
 extern int cap_bprm_set_security(struct linux_binprm *bprm);
-extern void cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
+extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
 extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
 			      const void *value, size_t size, int flags);
 extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
 extern int cap_inode_need_killpriv(struct dentry *dentry);
 extern int cap_inode_killpriv(struct dentry *dentry);
-extern int cap_task_post_setuid(uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags);
-extern void cap_task_reparent_to_init(struct task_struct *p);
+extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
 extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			  unsigned long arg4, unsigned long arg5, long *rc_p);
+			  unsigned long arg4, unsigned long arg5);
 extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp);
 extern int cap_task_setioprio(struct task_struct *p, int ioprio);
 extern int cap_task_setnice(struct task_struct *p, int nice);
@@ -170,8 +167,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Compute and set the security attributes of a process being transformed
  *	by an execve operation based on the old attributes (current->security)
  *	and the information saved in @bprm->security by the set_security hook.
- *	Since this hook function (and its caller) are void, this hook can not
- *	return an error.  However, it can leave the security attributes of the
+ *	Since this function may return an error, in which case the process will
+ *      be killed.  However, it can leave the security attributes of the
  *	process unchanged if an access failure occurs at this point.
  *	bprm_apply_creds is called under task_lock.  @unsafe indicates various
  *	reasons why it may be unsafe to change security state.
@@ -593,15 +590,18 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	manual page for definitions of the @clone_flags.
  *	@clone_flags contains the flags indicating what should be shared.
  *	Return 0 if permission is granted.
- * @cred_alloc_security:
- *	@cred contains the cred struct for child process.
- *	Allocate and attach a security structure to the cred->security field.
- *	The security field is initialized to NULL when the task structure is
- *	allocated.
- *	Return 0 if operation was successful.
  * @cred_free:
  *	@cred points to the credentials.
  *	Deallocate and clear the cred->security field in a set of credentials.
+ * @cred_prepare:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	@gfp indicates the atomicity of any memory allocations.
+ *	Prepare a new set of credentials by copying the data from the old set.
+ * @cred_commit:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	Install a new set of credentials.
  * @task_setuid:
  *	Check permission before setting one or more of the user identity
  *	attributes of the current process.  The @flags parameter indicates
@@ -614,15 +614,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@id2 contains a uid.
  *	@flags contains one of the LSM_SETID_* values.
  *	Return 0 if permission is granted.
- * @task_post_setuid:
+ * @task_fix_setuid:
  *	Update the module's state after setting one or more of the user
  *	identity attributes of the current process.  The @flags parameter
  *	indicates which of the set*uid system calls invoked this hook.  If
- *	@flags is LSM_SETID_FS, then @old_ruid is the old fs uid and the other
- *	parameters are not used.
- *	@old_ruid contains the old real uid (or fs uid if LSM_SETID_FS).
- *	@old_euid contains the old effective uid (or -1 if LSM_SETID_FS).
- *	@old_suid contains the old saved uid (or -1 if LSM_SETID_FS).
+ *	@new is the set of credentials that will be installed.  Modifications
+ *	should be made to this rather than to @current->cred.
+ *	@old is the set of credentials that are being replaces
  *	@flags contains one of the LSM_SETID_* values.
  *	Return 0 on success.
  * @task_setgid:
@@ -725,13 +723,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@arg3 contains a argument.
  *	@arg4 contains a argument.
  *	@arg5 contains a argument.
- *      @rc_p contains a pointer to communicate back the forced return code
- *	Return 0 if permission is granted, and non-zero if the security module
- *      has taken responsibility (setting *rc_p) for the prctl call.
- * @task_reparent_to_init:
- *	Set the security attributes in @p->security for a kernel thread that
- *	is being reparented to the init task.
- *	@p contains the task_struct for the kernel thread.
+ *	Return -ENOSYS if no-one wanted to handle this op, any other value to
+ *	cause prctl() to return immediately with that value.
  * @task_to_inode:
  *	Set the security attributes for an inode based on an associated task's
  *	security attributes, e.g. for /proc/pid inodes.
@@ -1008,7 +1001,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	See whether a specific operational right is granted to a process on a
  *	key.
  *	@key_ref refers to the key (key pointer + possession attribute bit).
- *	@context points to the process to provide the context against which to
+ *	@cred points to the credentials to provide the context against which to
  *	evaluate the security data on the key.
  *	@perm describes the combination of permissions required of this key.
  *	Return 1 if permission granted, 0 if permission denied and -ve it the
@@ -1170,6 +1163,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@child process.
  *	Security modules may also want to perform a process tracing check
  *	during an execve in the set_security or apply_creds hooks of
+ *	tracing check during an execve in the bprm_set_creds hook of
  *	binprm_security_ops if the process is being traced and its security
  *	attributes would be changed by the execve.
  *	@child contains the task_struct structure for the target process.
@@ -1193,19 +1187,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
  *	Return 0 if the capability sets were successfully obtained.
- * @capset_check:
- *	Check permission before setting the @effective, @inheritable, and
- *	@permitted capability sets for the current process.
- *	@effective contains the effective capability set.
- *	@inheritable contains the inheritable capability set.
- *	@permitted contains the permitted capability set.
- *	Return 0 if permission is granted.
- * @capset_set:
+ * @capset:
  *	Set the @effective, @inheritable, and @permitted capability sets for
  *	the current process.
+ *	@new contains the new credentials structure for target process.
+ *	@old contains the current credentials structure for target process.
  *	@effective contains the effective capability set.
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
+ *	Return 0 and update @new if permission is granted.
  * @capable:
  *	Check whether the @tsk process has the @cap capability.
  *	@tsk contains the task_struct for the process.
@@ -1297,12 +1287,11 @@ struct security_operations {
 	int (*capget) (struct task_struct *target,
 		       kernel_cap_t *effective,
 		       kernel_cap_t *inheritable, kernel_cap_t *permitted);
-	int (*capset_check) (const kernel_cap_t *effective,
-			     const kernel_cap_t *inheritable,
-			     const kernel_cap_t *permitted);
-	void (*capset_set) (const kernel_cap_t *effective,
-			    const kernel_cap_t *inheritable,
-			    const kernel_cap_t *permitted);
+	int (*capset) (struct cred *new,
+		       const struct cred *old,
+		       const kernel_cap_t *effective,
+		       const kernel_cap_t *inheritable,
+		       const kernel_cap_t *permitted);
 	int (*capable) (struct task_struct *tsk, int cap, int audit);
 	int (*acct) (struct file *file);
 	int (*sysctl) (struct ctl_table *table, int op);
@@ -1314,7 +1303,7 @@ struct security_operations {
 
 	int (*bprm_alloc_security) (struct linux_binprm *bprm);
 	void (*bprm_free_security) (struct linux_binprm *bprm);
-	void (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe);
+	int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe);
 	void (*bprm_post_apply_creds) (struct linux_binprm *bprm);
 	int (*bprm_set_security) (struct linux_binprm *bprm);
 	int (*bprm_check_security) (struct linux_binprm *bprm);
@@ -1405,11 +1394,13 @@ struct security_operations {
 	int (*dentry_open) (struct file *file, const struct cred *cred);
 
 	int (*task_create) (unsigned long clone_flags);
-	int (*cred_alloc_security) (struct cred *cred);
 	void (*cred_free) (struct cred *cred);
+	int (*cred_prepare)(struct cred *new, const struct cred *old,
+			    gfp_t gfp);
+	void (*cred_commit)(struct cred *new, const struct cred *old);
 	int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags);
-	int (*task_post_setuid) (uid_t old_ruid /* or fsuid */ ,
-				 uid_t old_euid, uid_t old_suid, int flags);
+	int (*task_fix_setuid) (struct cred *new, const struct cred *old,
+				int flags);
 	int (*task_setgid) (gid_t id0, gid_t id1, gid_t id2, int flags);
 	int (*task_setpgid) (struct task_struct *p, pid_t pgid);
 	int (*task_getpgid) (struct task_struct *p);
@@ -1429,8 +1420,7 @@ struct security_operations {
 	int (*task_wait) (struct task_struct *p);
 	int (*task_prctl) (int option, unsigned long arg2,
 			   unsigned long arg3, unsigned long arg4,
-			   unsigned long arg5, long *rc_p);
-	void (*task_reparent_to_init) (struct task_struct *p);
+			   unsigned long arg5);
 	void (*task_to_inode) (struct task_struct *p, struct inode *inode);
 
 	int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag);
@@ -1535,10 +1525,10 @@ struct security_operations {
 
 	/* key management security hooks */
 #ifdef CONFIG_KEYS
-	int (*key_alloc) (struct key *key, struct task_struct *tsk, unsigned long flags);
+	int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags);
 	void (*key_free) (struct key *key);
 	int (*key_permission) (key_ref_t key_ref,
-			       struct task_struct *context,
+			       const struct cred *cred,
 			       key_perm_t perm);
 	int (*key_getsecurity)(struct key *key, char **_buffer);
 #endif	/* CONFIG_KEYS */
@@ -1564,12 +1554,10 @@ int security_capget(struct task_struct *target,
 		    kernel_cap_t *effective,
 		    kernel_cap_t *inheritable,
 		    kernel_cap_t *permitted);
-int security_capset_check(const kernel_cap_t *effective,
-			  const kernel_cap_t *inheritable,
-			  const kernel_cap_t *permitted);
-void security_capset_set(const kernel_cap_t *effective,
-			 const kernel_cap_t *inheritable,
-			 const kernel_cap_t *permitted);
+int security_capset(struct cred *new, const struct cred *old,
+		    const kernel_cap_t *effective,
+		    const kernel_cap_t *inheritable,
+		    const kernel_cap_t *permitted);
 int security_capable(struct task_struct *tsk, int cap);
 int security_capable_noaudit(struct task_struct *tsk, int cap);
 int security_acct(struct file *file);
@@ -1583,7 +1571,7 @@ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_vm_enough_memory_kern(long pages);
 int security_bprm_alloc(struct linux_binprm *bprm);
 void security_bprm_free(struct linux_binprm *bprm);
-void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
+int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
 void security_bprm_post_apply_creds(struct linux_binprm *bprm);
 int security_bprm_set(struct linux_binprm *bprm);
 int security_bprm_check(struct linux_binprm *bprm);
@@ -1660,11 +1648,12 @@ int security_file_send_sigiotask(struct task_struct *tsk,
 int security_file_receive(struct file *file);
 int security_dentry_open(struct file *file, const struct cred *cred);
 int security_task_create(unsigned long clone_flags);
-int security_cred_alloc(struct cred *cred);
 void security_cred_free(struct cred *cred);
+int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp);
+void security_commit_creds(struct cred *new, const struct cred *old);
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags);
-int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
-			      uid_t old_suid, int flags);
+int security_task_fix_setuid(struct cred *new, const struct cred *old,
+			     int flags);
 int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags);
 int security_task_setpgid(struct task_struct *p, pid_t pgid);
 int security_task_getpgid(struct task_struct *p);
@@ -1683,8 +1672,7 @@ int security_task_kill(struct task_struct *p, struct siginfo *info,
 			int sig, u32 secid);
 int security_task_wait(struct task_struct *p);
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			 unsigned long arg4, unsigned long arg5, long *rc_p);
-void security_task_reparent_to_init(struct task_struct *p);
+			unsigned long arg4, unsigned long arg5);
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
 void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
@@ -1759,18 +1747,13 @@ static inline int security_capget(struct task_struct *target,
 	return cap_capget(target, effective, inheritable, permitted);
 }
 
-static inline int security_capset_check(const kernel_cap_t *effective,
-					const kernel_cap_t *inheritable,
-					const kernel_cap_t *permitted)
+static inline int security_capset(struct cred *new,
+				   const struct cred *old,
+				   const kernel_cap_t *effective,
+				   const kernel_cap_t *inheritable,
+				   const kernel_cap_t *permitted)
 {
-	return cap_capset_check(effective, inheritable, permitted);
-}
-
-static inline void security_capset_set(const kernel_cap_t *effective,
-				       const kernel_cap_t *inheritable,
-				       const kernel_cap_t *permitted)
-{
-	cap_capset_set(effective, inheritable, permitted);
+	return cap_capset(new, old, effective, inheritable, permitted);
 }
 
 static inline int security_capable(struct task_struct *tsk, int cap)
@@ -1837,9 +1820,9 @@ static inline int security_bprm_alloc(struct linux_binprm *bprm)
 static inline void security_bprm_free(struct linux_binprm *bprm)
 { }
 
-static inline void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 {
-	cap_bprm_apply_creds(bprm, unsafe);
+	return cap_bprm_apply_creds(bprm, unsafe);
 }
 
 static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm)
@@ -2182,13 +2165,20 @@ static inline int security_task_create(unsigned long clone_flags)
 	return 0;
 }
 
-static inline int security_cred_alloc(struct cred *cred)
+static inline void security_cred_free(struct cred *cred)
+{ }
+
+static inline int security_prepare_creds(struct cred *new,
+					 const struct cred *old,
+					 gfp_t gfp)
 {
 	return 0;
 }
 
-static inline void security_cred_free(struct cred *cred)
-{ }
+static inline void security_commit_creds(struct cred *new,
+					 const struct cred *old)
+{
+}
 
 static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2,
 				       int flags)
@@ -2196,10 +2186,11 @@ static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2,
 	return 0;
 }
 
-static inline int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
-					    uid_t old_suid, int flags)
+static inline int security_task_fix_setuid(struct cred *new,
+					   const struct cred *old,
+					   int flags)
 {
-	return cap_task_post_setuid(old_ruid, old_euid, old_suid, flags);
+	return cap_task_fix_setuid(new, old, flags);
 }
 
 static inline int security_task_setgid(gid_t id0, gid_t id1, gid_t id2,
@@ -2286,14 +2277,9 @@ static inline int security_task_wait(struct task_struct *p)
 static inline int security_task_prctl(int option, unsigned long arg2,
 				      unsigned long arg3,
 				      unsigned long arg4,
-				      unsigned long arg5, long *rc_p)
-{
-	return cap_task_prctl(option, arg2, arg3, arg3, arg5, rc_p);
-}
-
-static inline void security_task_reparent_to_init(struct task_struct *p)
+				      unsigned long arg5)
 {
-	cap_task_reparent_to_init(p);
+	return cap_task_prctl(option, arg2, arg3, arg3, arg5);
 }
 
 static inline void security_task_to_inode(struct task_struct *p, struct inode *inode)
@@ -2719,16 +2705,16 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 
-int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags);
+int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags);
 void security_key_free(struct key *key);
 int security_key_permission(key_ref_t key_ref,
-			    struct task_struct *context, key_perm_t perm);
+			    const struct cred *cred, key_perm_t perm);
 int security_key_getsecurity(struct key *key, char **_buffer);
 
 #else
 
 static inline int security_key_alloc(struct key *key,
-				     struct task_struct *tsk,
+				     const struct cred *cred,
 				     unsigned long flags)
 {
 	return 0;
@@ -2739,7 +2725,7 @@ static inline void security_key_free(struct key *key)
 }
 
 static inline int security_key_permission(key_ref_t key_ref,
-					  struct task_struct *context,
+					  const struct cred *cred,
 					  key_perm_t perm)
 {
 	return 0;
diff --git a/init/main.c b/init/main.c
index 7e117a231af..db843bff573 100644
--- a/init/main.c
+++ b/init/main.c
@@ -669,6 +669,7 @@ asmlinkage void __init start_kernel(void)
 		efi_enter_virtual_mode();
 #endif
 	thread_info_cache_init();
+	cred_init();
 	fork_init(num_physpages);
 	proc_caches_init();
 	buffer_init();
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ae8ef88ade3..bc1e2d854bf 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2546,18 +2546,17 @@ int __audit_signal_info(int sig, struct task_struct *t)
 
 /**
  * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
- * @bprm pointer to the bprm being processed
- * @caps the caps read from the disk
+ * @bprm: pointer to the bprm being processed
+ * @new: the proposed new credentials
+ * @old: the old credentials
  *
  * Simply check if the proc already has the caps given by the file and if not
  * store the priv escalation info for later auditing at the end of the syscall
  *
- * this can fail and we don't care.  See the note in audit.h for
- * audit_log_bprm_fcaps() for my explaination....
- *
  * -Eric
  */
-void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_cap_t *pE)
+int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
+			   const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_bprm_fcaps *ax;
 	struct audit_context *context = current->audit_context;
@@ -2566,7 +2565,7 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
 	if (!ax)
-		return;
+		return -ENOMEM;
 
 	ax->d.type = AUDIT_BPRM_FCAPS;
 	ax->d.next = context->aux;
@@ -2581,26 +2580,27 @@ void __audit_log_bprm_fcaps(struct linux_binprm *bprm, kernel_cap_t *pP, kernel_
 	ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
 	ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
 
-	ax->old_pcap.permitted = *pP;
-	ax->old_pcap.inheritable = current->cred->cap_inheritable;
-	ax->old_pcap.effective = *pE;
+	ax->old_pcap.permitted   = old->cap_permitted;
+	ax->old_pcap.inheritable = old->cap_inheritable;
+	ax->old_pcap.effective   = old->cap_effective;
 
-	ax->new_pcap.permitted = current->cred->cap_permitted;
-	ax->new_pcap.inheritable = current->cred->cap_inheritable;
-	ax->new_pcap.effective = current->cred->cap_effective;
+	ax->new_pcap.permitted   = new->cap_permitted;
+	ax->new_pcap.inheritable = new->cap_inheritable;
+	ax->new_pcap.effective   = new->cap_effective;
+	return 0;
 }
 
 /**
  * __audit_log_capset - store information about the arguments to the capset syscall
- * @pid target pid of the capset call
- * @eff effective cap set
- * @inh inheritible cap set
- * @perm permited cap set
+ * @pid: target pid of the capset call
+ * @new: the new credentials
+ * @old: the old (current) credentials
  *
  * Record the aguments userspace sent to sys_capset for later printing by the
  * audit system if applicable
  */
-int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_cap_t *perm)
+int __audit_log_capset(pid_t pid,
+		       const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_capset *ax;
 	struct audit_context *context = current->audit_context;
@@ -2617,9 +2617,9 @@ int __audit_log_capset(pid_t pid, kernel_cap_t *eff, kernel_cap_t *inh, kernel_c
 	context->aux = (void *)ax;
 
 	ax->pid = pid;
-	ax->cap.effective = *eff;
-	ax->cap.inheritable = *eff;
-	ax->cap.permitted = *perm;
+	ax->cap.effective   = new->cap_effective;
+	ax->cap.inheritable = new->cap_effective;
+	ax->cap.permitted   = new->cap_permitted;
 
 	return 0;
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index a404b980b1b..36b4b4daebe 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,12 +15,7 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-
-/*
- * This lock protects task->cap_* for all tasks including current.
- * Locking rule: acquire this prior to tasklist_lock.
- */
-static DEFINE_SPINLOCK(task_capability_lock);
+#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -128,12 +123,11 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 }
 
 /*
- * If we have configured with filesystem capability support, then the
- * only thing that can change the capabilities of the current process
- * is the current process. As such, we can't be in this code at the
- * same time as we are in the process of setting capabilities in this
- * process. The net result is that we can limit our use of locks to
- * when we are reading the caps of another process.
+ * The only thing that can change the capabilities of the current
+ * process is the current process. As such, we can't be in this code
+ * at the same time as we are in the process of setting capabilities
+ * in this process. The net result is that we can limit our use of
+ * locks to when we are reading the caps of another process.
  */
 static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 				     kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -143,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	if (pid && (pid != task_pid_vnr(current))) {
 		struct task_struct *target;
 
-		spin_lock(&task_capability_lock);
 		read_lock(&tasklist_lock);
 
 		target = find_task_by_vpid(pid);
@@ -153,34 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 			ret = security_capget(target, pEp, pIp, pPp);
 
 		read_unlock(&tasklist_lock);
-		spin_unlock(&task_capability_lock);
 	} else
 		ret = security_capget(current, pEp, pIp, pPp);
 
 	return ret;
 }
 
-/*
- * Atomically modify the effective capabilities returning the original
- * value. No permission check is performed here - it is assumed that the
- * caller is permitted to set the desired effective capabilities.
- */
-kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
-{
-	kernel_cap_t pE_old;
-
-	spin_lock(&task_capability_lock);
-
-	pE_old = current->cred->cap_effective;
-	current->cred->cap_effective = pE_new;
-
-	spin_unlock(&task_capability_lock);
-
-	return pE_old;
-}
-
-EXPORT_SYMBOL(cap_set_effective);
-
 /**
  * sys_capget - get the capabilities of a given process.
  * @header: pointer to struct that contains capability version and
@@ -208,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 		return -EINVAL;
 
 	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
-
 	if (!ret) {
 		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 		unsigned i;
@@ -270,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
+	struct cred *new;
 	int ret;
 	pid_t pid;
 
@@ -284,8 +255,8 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (pid != 0 && pid != task_pid_vnr(current))
 		return -EPERM;
 
-	if (copy_from_user(&kdata, data, tocopy
-			   * sizeof(struct __user_cap_data_struct)))
+	if (copy_from_user(&kdata, data,
+			   tocopy * sizeof(struct __user_cap_data_struct)))
 		return -EFAULT;
 
 	for (i = 0; i < tocopy; i++) {
@@ -300,24 +271,23 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
-	ret = audit_log_capset(pid, &effective, &inheritable, &permitted);
-	if (ret)
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = security_capset(new, current_cred(),
+			      &effective, &inheritable, &permitted);
+	if (ret < 0)
+		goto error;
+
+	ret = audit_log_capset(pid, new, current_cred());
+	if (ret < 0)
 		return ret;
 
-	/* This lock is required even when filesystem capability support is
-	 * configured - it protects the sys_capget() call from returning
-	 * incorrect data in the case that the targeted process is not the
-	 * current one.
-	 */
-	spin_lock(&task_capability_lock);
-
-	ret = security_capset_check(&effective, &inheritable, &permitted);
-	/* Having verified that the proposed changes are legal, we now put them
-	 * into effect.
-	 */
-	if (!ret)
-		security_capset_set(&effective, &inheritable, &permitted);
-	spin_unlock(&task_capability_lock);
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
 	return ret;
 }
 
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 00000000000..2dc4fc2d0bf
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
+/* Internal credentials stuff
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+/*
+ * user.c
+ */
+static inline void sched_switch_user(struct task_struct *p)
+{
+#ifdef CONFIG_USER_SCHED
+	sched_move_task(p);
+#endif	/* CONFIG_USER_SCHED */
+}
+
diff --git a/kernel/cred.c b/kernel/cred.c
index ac73e361768..cb6b5eda978 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -15,6 +15,10 @@
 #include <linux/keyctl.h>
 #include <linux/init_task.h>
 #include <linux/security.h>
+#include <linux/cn_proc.h>
+#include "cred-internals.h"
+
+static struct kmem_cache *cred_jar;
 
 /*
  * The common credentials for the initial task's thread group
@@ -64,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu)
 /*
  * Release a set of thread group credentials.
  */
-static void release_tgcred(struct cred *cred)
+void release_tgcred(struct cred *cred)
 {
 #ifdef CONFIG_KEYS
 	struct thread_group_cred *tgcred = cred->tgcred;
@@ -81,79 +85,322 @@ static void put_cred_rcu(struct rcu_head *rcu)
 {
 	struct cred *cred = container_of(rcu, struct cred, rcu);
 
-	BUG_ON(atomic_read(&cred->usage) != 0);
+	if (atomic_read(&cred->usage) != 0)
+		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
+		      cred, atomic_read(&cred->usage));
 
+	security_cred_free(cred);
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
 	release_tgcred(cred);
 	put_group_info(cred->group_info);
 	free_uid(cred->user);
-	security_cred_free(cred);
-	kfree(cred);
+	kmem_cache_free(cred_jar, cred);
 }
 
 /**
  * __put_cred - Destroy a set of credentials
- * @sec: The record to release
+ * @cred: The record to release
  *
  * Destroy a set of credentials on which no references remain.
  */
 void __put_cred(struct cred *cred)
 {
+	BUG_ON(atomic_read(&cred->usage) != 0);
+
 	call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
+/**
+ * prepare_creds - Prepare a new set of credentials for modification
+ *
+ * Prepare a new set of task credentials for modification.  A task's creds
+ * shouldn't generally be modified directly, therefore this function is used to
+ * prepare a new copy, which the caller then modifies and then commits by
+ * calling commit_creds().
+ *
+ * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
+ *
+ * Call commit_creds() or abort_creds() to clean up.
+ */
+struct cred *prepare_creds(void)
+{
+	struct task_struct *task = current;
+	const struct cred *old;
+	struct cred *new;
+
+	BUG_ON(atomic_read(&task->cred->usage) < 1);
+
+	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	old = task->cred;
+	memcpy(new, old, sizeof(struct cred));
+
+	atomic_set(&new->usage, 1);
+	get_group_info(new->group_info);
+	get_uid(new->user);
+
+#ifdef CONFIG_KEYS
+	key_get(new->thread_keyring);
+	key_get(new->request_key_auth);
+	atomic_inc(&new->tgcred->usage);
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+
+	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+		goto error;
+	return new;
+
+error:
+	abort_creds(new);
+	return NULL;
+}
+EXPORT_SYMBOL(prepare_creds);
+
+/*
+ * prepare new credentials for the usermode helper dispatcher
+ */
+struct cred *prepare_usermodehelper_creds(void)
+{
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = NULL;
+#endif
+	struct cred *new;
+
+#ifdef CONFIG_KEYS
+	tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
+	if (!tgcred)
+		return NULL;
+#endif
+
+	new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+
+	memcpy(new, &init_cred, sizeof(struct cred));
+
+	atomic_set(&new->usage, 1);
+	get_group_info(new->group_info);
+	get_uid(new->user);
+
+#ifdef CONFIG_KEYS
+	new->thread_keyring = NULL;
+	new->request_key_auth = NULL;
+	new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
+
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
+	new->tgcred = tgcred;
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
+		goto error;
+
+	BUG_ON(atomic_read(&new->usage) != 1);
+	return new;
+
+error:
+	put_cred(new);
+	return NULL;
+}
+
 /*
  * Copy credentials for the new process created by fork()
+ *
+ * We share if we can, but under some circumstances we have to generate a new
+ * set.
  */
 int copy_creds(struct task_struct *p, unsigned long clone_flags)
 {
-	struct cred *pcred;
-	int ret;
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred;
+#endif
+	struct cred *new;
+
+	mutex_init(&p->cred_exec_mutex);
 
-	pcred = kmemdup(p->cred, sizeof(*p->cred), GFP_KERNEL);
-	if (!pcred)
+	if (
+#ifdef CONFIG_KEYS
+		!p->cred->thread_keyring &&
+#endif
+		clone_flags & CLONE_THREAD
+	    ) {
+		get_cred(p->cred);
+		atomic_inc(&p->cred->user->processes);
+		return 0;
+	}
+
+	new = prepare_creds();
+	if (!new)
 		return -ENOMEM;
 
 #ifdef CONFIG_KEYS
-	if (clone_flags & CLONE_THREAD) {
-		atomic_inc(&pcred->tgcred->usage);
-	} else {
-		pcred->tgcred = kmalloc(sizeof(struct cred), GFP_KERNEL);
-		if (!pcred->tgcred) {
-			kfree(pcred);
+	/* new threads get their own thread keyrings if their parent already
+	 * had one */
+	if (new->thread_keyring) {
+		key_put(new->thread_keyring);
+		new->thread_keyring = NULL;
+		if (clone_flags & CLONE_THREAD)
+			install_thread_keyring_to_cred(new);
+	}
+
+	/* we share the process and session keyrings between all the threads in
+	 * a process - this is slightly icky as we violate COW credentials a
+	 * bit */
+	if (!(clone_flags & CLONE_THREAD)) {
+		tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+		if (!tgcred) {
+			put_cred(new);
 			return -ENOMEM;
 		}
-		atomic_set(&pcred->tgcred->usage, 1);
-		spin_lock_init(&pcred->tgcred->lock);
-		pcred->tgcred->process_keyring = NULL;
-		pcred->tgcred->session_keyring =
-			key_get(p->cred->tgcred->session_keyring);
+		atomic_set(&tgcred->usage, 1);
+		spin_lock_init(&tgcred->lock);
+		tgcred->process_keyring = NULL;
+		tgcred->session_keyring = key_get(new->tgcred->session_keyring);
+
+		release_tgcred(new);
+		new->tgcred = tgcred;
 	}
 #endif
 
-#ifdef CONFIG_SECURITY
-	pcred->security = NULL;
-#endif
+	atomic_inc(&new->user->processes);
+	p->cred = new;
+	return 0;
+}
 
-	ret = security_cred_alloc(pcred);
-	if (ret < 0) {
-		release_tgcred(pcred);
-		kfree(pcred);
-		return ret;
+/**
+ * commit_creds - Install new credentials upon the current task
+ * @new: The credentials to be assigned
+ *
+ * Install a new set of credentials to the current task, using RCU to replace
+ * the old set.
+ *
+ * This function eats the caller's reference to the new credentials.
+ *
+ * Always returns 0 thus allowing this function to be tail-called at the end
+ * of, say, sys_setgid().
+ */
+int commit_creds(struct cred *new)
+{
+	struct task_struct *task = current;
+	const struct cred *old;
+
+	BUG_ON(atomic_read(&new->usage) < 1);
+	BUG_ON(atomic_read(&task->cred->usage) < 1);
+
+	old = task->cred;
+	security_commit_creds(new, old);
+
+	/* dumpability changes */
+	if (old->euid != new->euid ||
+	    old->egid != new->egid ||
+	    old->fsuid != new->fsuid ||
+	    old->fsgid != new->fsgid ||
+	    !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+		set_dumpable(task->mm, suid_dumpable);
+		task->pdeath_signal = 0;
+		smp_wmb();
 	}
 
-	atomic_set(&pcred->usage, 1);
-	get_group_info(pcred->group_info);
-	get_uid(pcred->user);
-	key_get(pcred->thread_keyring);
-	key_get(pcred->request_key_auth);
+	/* alter the thread keyring */
+	if (new->fsuid != old->fsuid)
+		key_fsuid_changed(task);
+	if (new->fsgid != old->fsgid)
+		key_fsgid_changed(task);
+
+	/* do it
+	 * - What if a process setreuid()'s and this brings the
+	 *   new uid over his NPROC rlimit?  We can check this now
+	 *   cheaply with the new uid cache, so if it matters
+	 *   we should be checking for it.  -DaveM
+	 */
+	if (new->user != old->user)
+		atomic_inc(&new->user->processes);
+	rcu_assign_pointer(task->cred, new);
+	if (new->user != old->user)
+		atomic_dec(&old->user->processes);
+
+	sched_switch_user(task);
+
+	/* send notifications */
+	if (new->uid   != old->uid  ||
+	    new->euid  != old->euid ||
+	    new->suid  != old->suid ||
+	    new->fsuid != old->fsuid)
+		proc_id_connector(task, PROC_EVENT_UID);
 
-	atomic_inc(&pcred->user->processes);
+	if (new->gid   != old->gid  ||
+	    new->egid  != old->egid ||
+	    new->sgid  != old->sgid ||
+	    new->fsgid != old->fsgid)
+		proc_id_connector(task, PROC_EVENT_GID);
 
-	/* RCU assignment is unneeded here as no-one can have accessed this
-	 * pointer yet, barring us */
-	p->cred = pcred;
+	put_cred(old);
 	return 0;
 }
+EXPORT_SYMBOL(commit_creds);
+
+/**
+ * abort_creds - Discard a set of credentials and unlock the current task
+ * @new: The credentials that were going to be applied
+ *
+ * Discard a set of credentials that were under construction and unlock the
+ * current task.
+ */
+void abort_creds(struct cred *new)
+{
+	BUG_ON(atomic_read(&new->usage) < 1);
+	put_cred(new);
+}
+EXPORT_SYMBOL(abort_creds);
+
+/**
+ * override_creds - Temporarily override the current process's credentials
+ * @new: The credentials to be assigned
+ *
+ * Install a set of temporary override credentials on the current process,
+ * returning the old set for later reversion.
+ */
+const struct cred *override_creds(const struct cred *new)
+{
+	const struct cred *old = current->cred;
+
+	rcu_assign_pointer(current->cred, get_cred(new));
+	return old;
+}
+EXPORT_SYMBOL(override_creds);
+
+/**
+ * revert_creds - Revert a temporary credentials override
+ * @old: The credentials to be restored
+ *
+ * Revert a temporary set of override credentials to an old set, discarding the
+ * override set.
+ */
+void revert_creds(const struct cred *old)
+{
+	const struct cred *override = current->cred;
+
+	rcu_assign_pointer(current->cred, old);
+	put_cred(override);
+}
+EXPORT_SYMBOL(revert_creds);
+
+/*
+ * initialise the credentials stuff
+ */
+void __init cred_init(void)
+{
+	/* allocate a slab in which we can store credentials */
+	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
+				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index bbc22530f2c..c0711da1548 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,12 +47,14 @@
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
+#include <linux/init_task.h>
 #include <trace/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
+#include "cred-internals.h"
 
 static void exit_mm(struct task_struct * tsk);
 
@@ -338,12 +340,12 @@ static void reparent_to_kthreadd(void)
 	/* cpus_allowed? */
 	/* rt_priority? */
 	/* signals? */
-	security_task_reparent_to_init(current);
 	memcpy(current->signal->rlim, init_task.signal->rlim,
 	       sizeof(current->signal->rlim));
-	atomic_inc(&(INIT_USER->__count));
+
+	atomic_inc(&init_cred.usage);
+	commit_creds(&init_cred);
 	write_unlock_irq(&tasklist_lock);
-	switch_uid(INIT_USER);
 }
 
 void __set_special_pids(struct pid *pid)
@@ -1085,7 +1087,6 @@ NORET_TYPE void do_exit(long code)
 	check_stack_usage();
 	exit_thread();
 	cgroup_exit(tsk, 1);
-	exit_keys(tsk);
 
 	if (group_dead && tsk->signal->leader)
 		disassociate_ctty(1);
diff --git a/kernel/fork.c b/kernel/fork.c
index ded1972672a..82a7948a664 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1084,10 +1084,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_sighand;
 	if ((retval = copy_mm(clone_flags, p)))
 		goto bad_fork_cleanup_signal;
-	if ((retval = copy_keys(clone_flags, p)))
-		goto bad_fork_cleanup_mm;
 	if ((retval = copy_namespaces(clone_flags, p)))
-		goto bad_fork_cleanup_keys;
+		goto bad_fork_cleanup_mm;
 	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1252,8 +1250,6 @@ bad_fork_cleanup_io:
 	put_io_context(p->io_context);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
-bad_fork_cleanup_keys:
-	exit_keys(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
@@ -1281,6 +1277,7 @@ bad_fork_cleanup_cgroup:
 bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
+	atomic_dec(&p->cred->user->processes);
 	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f044f8f5770..b46dbb90866 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module);
 struct subprocess_info {
 	struct work_struct work;
 	struct completion *complete;
+	struct cred *cred;
 	char *path;
 	char **argv;
 	char **envp;
-	struct key *ring;
 	enum umh_wait wait;
 	int retval;
 	struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	struct key *new_session, *old_session;
 	int retval;
 
-	/* Unblock all signals and set the session keyring. */
-	new_session = key_get(sub_info->ring);
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
+	/* Unblock all signals */
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
-	key_put(old_session);
+	/* Install the credentials */
+	commit_creds(sub_info->cred);
+	sub_info->cred = NULL;
 
 	/* Install input pipe when needed */
 	if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
 	if (info->cleanup)
 		(*info->cleanup)(info->argv, info->envp);
+	if (info->cred)
+		put_cred(info->cred);
 	kfree(info);
 }
 EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
 	pid_t pid;
 	enum umh_wait wait = sub_info->wait;
 
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
 	/* CLONE_VFORK: wait until the usermode helper has execve'd
 	 * successfully We need the data structures to stay around
 	 * until that is done.  */
@@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
+	sub_info->cred = prepare_usermodehelper_creds();
+	if (!sub_info->cred)
+		return NULL;
 
   out:
 	return sub_info;
@@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
 void call_usermodehelper_setkeys(struct subprocess_info *info,
 				 struct key *session_keyring)
 {
-	info->ring = session_keyring;
+#ifdef CONFIG_KEYS
+	struct thread_group_cred *tgcred = info->cred->tgcred;
+	key_put(tgcred->session_keyring);
+	tgcred->session_keyring = key_get(session_keyring);
+#else
+	BUG();
+#endif
 }
 EXPORT_SYMBOL(call_usermodehelper_setkeys);
 
@@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
 
+	BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
+
 	helper_lock();
 	if (sub_info->path[0] == '\0')
 		goto out;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b9d5f4e4f6a..f764b880695 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -171,6 +171,14 @@ int ptrace_attach(struct task_struct *task)
 	if (same_thread_group(task, current))
 		goto out;
 
+	/* Protect exec's credential calculations against our interference;
+	 * SUID, SGID and LSM creds get determined differently under ptrace.
+	 */
+	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	if (retval  < 0)
+		goto out;
+
+	retval = -EPERM;
 repeat:
 	/*
 	 * Nasty, nasty.
@@ -210,6 +218,7 @@ repeat:
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
+	mutex_unlock(&current->cred_exec_mutex);
 out:
 	return retval;
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 84989124baf..2a64304ed54 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -180,7 +180,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
- *   appopriate lock must be held to protect t's user_struct
+ *   appopriate lock must be held to stop the target task from exiting
  */
 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
@@ -194,7 +194,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	 * caller must be holding the RCU readlock (by way of a spinlock) and
 	 * we use RCU protection here
 	 */
-	user = __task_cred(t)->user;
+	user = get_uid(__task_cred(t)->user);
 	atomic_inc(&user->sigpending);
 	if (override_rlimit ||
 	    atomic_read(&user->sigpending) <=
@@ -202,12 +202,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	if (unlikely(q == NULL)) {
 		atomic_dec(&user->sigpending);
+		free_uid(user);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->user = get_uid(user);
+		q->user = user;
 	}
-	return(q);
+
+	return q;
 }
 
 static void __sigqueue_free(struct sigqueue *q)
diff --git a/kernel/sys.c b/kernel/sys.c
index ccc9eb736d3..ab735040468 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -180,7 +180,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
-			user = cred->user;
+			user = (struct user_struct *) cred->user;
 			if (!who)
 				who = cred->uid;
 			else if ((who != cred->uid) &&
@@ -479,47 +479,48 @@ void ctrl_alt_del(void)
  */
 asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 {
-	struct cred *cred = current->cred;
-	int old_rgid = cred->gid;
-	int old_egid = cred->egid;
-	int new_rgid = old_rgid;
-	int new_egid = old_egid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
 	if (retval)
-		return retval;
+		goto error;
 
+	retval = -EPERM;
 	if (rgid != (gid_t) -1) {
-		if ((old_rgid == rgid) ||
-		    (cred->egid == rgid) ||
+		if (old->gid == rgid ||
+		    old->egid == rgid ||
 		    capable(CAP_SETGID))
-			new_rgid = rgid;
+			new->gid = rgid;
 		else
-			return -EPERM;
+			goto error;
 	}
 	if (egid != (gid_t) -1) {
-		if ((old_rgid == egid) ||
-		    (cred->egid == egid) ||
-		    (cred->sgid == egid) ||
+		if (old->gid == egid ||
+		    old->egid == egid ||
+		    old->sgid == egid ||
 		    capable(CAP_SETGID))
-			new_egid = egid;
+			new->egid = egid;
 		else
-			return -EPERM;
-	}
-	if (new_egid != old_egid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
+			goto error;
 	}
+
 	if (rgid != (gid_t) -1 ||
-	    (egid != (gid_t) -1 && egid != old_rgid))
-		cred->sgid = new_egid;
-	cred->fsgid = new_egid;
-	cred->egid = new_egid;
-	cred->gid = new_rgid;
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	    (egid != (gid_t) -1 && egid != old->gid))
+		new->sgid = new->egid;
+	new->fsgid = new->egid;
+
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 /*
@@ -529,40 +530,42 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
  */
 asmlinkage long sys_setgid(gid_t gid)
 {
-	struct cred *cred = current->cred;
-	int old_egid = cred->egid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
 	if (retval)
-		return retval;
+		goto error;
 
-	if (capable(CAP_SETGID)) {
-		if (old_egid != gid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->gid = cred->egid = cred->sgid = cred->fsgid = gid;
-	} else if ((gid == cred->gid) || (gid == cred->sgid)) {
-		if (old_egid != gid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->egid = cred->fsgid = gid;
-	}
+	retval = -EPERM;
+	if (capable(CAP_SETGID))
+		new->gid = new->egid = new->sgid = new->fsgid = gid;
+	else if (gid == old->gid || gid == old->sgid)
+		new->egid = new->fsgid = gid;
 	else
-		return -EPERM;
+		goto error;
 
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
   
-static int set_user(uid_t new_ruid, int dumpclear)
+/*
+ * change the user struct in a credentials set to match the new UID
+ */
+static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current->nsproxy->user_ns, new_ruid);
+	new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
@@ -573,13 +576,8 @@ static int set_user(uid_t new_ruid, int dumpclear)
 		return -EAGAIN;
 	}
 
-	switch_uid(new_user);
-
-	if (dumpclear) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
-	}
-	current->cred->uid = new_ruid;
+	free_uid(new->user);
+	new->user = new_user;
 	return 0;
 }
 
@@ -600,55 +598,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
  */
 asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 {
-	struct cred *cred = current->cred;
-	int old_ruid, old_euid, old_suid, new_ruid, new_euid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
 	if (retval)
-		return retval;
-
-	new_ruid = old_ruid = cred->uid;
-	new_euid = old_euid = cred->euid;
-	old_suid = cred->suid;
+		goto error;
 
+	retval = -EPERM;
 	if (ruid != (uid_t) -1) {
-		new_ruid = ruid;
-		if ((old_ruid != ruid) &&
-		    (cred->euid != ruid) &&
+		new->uid = ruid;
+		if (old->uid != ruid &&
+		    old->euid != ruid &&
 		    !capable(CAP_SETUID))
-			return -EPERM;
+			goto error;
 	}
 
 	if (euid != (uid_t) -1) {
-		new_euid = euid;
-		if ((old_ruid != euid) &&
-		    (cred->euid != euid) &&
-		    (cred->suid != euid) &&
+		new->euid = euid;
+		if (old->uid != euid &&
+		    old->euid != euid &&
+		    old->suid != euid &&
 		    !capable(CAP_SETUID))
-			return -EPERM;
+			goto error;
 	}
 
-	if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0)
-		return -EAGAIN;
+	retval = -EAGAIN;
+	if (new->uid != old->uid && set_user(new) < 0)
+		goto error;
 
-	if (new_euid != old_euid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
-	}
-	cred->fsuid = cred->euid = new_euid;
 	if (ruid != (uid_t) -1 ||
-	    (euid != (uid_t) -1 && euid != old_ruid))
-		cred->suid = cred->euid;
-	cred->fsuid = cred->euid;
-
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	    (euid != (uid_t) -1 && euid != old->uid))
+		new->suid = new->euid;
+	new->fsuid = new->euid;
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
-}
+	retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
+	if (retval < 0)
+		goto error;
 
+	return commit_creds(new);
 
+error:
+	abort_creds(new);
+	return retval;
+}
 		
 /*
  * setuid() is implemented like SysV with SAVED_IDS 
@@ -663,37 +662,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
  */
 asmlinkage long sys_setuid(uid_t uid)
 {
-	struct cred *cred = current->cred;
-	int old_euid = cred->euid;
-	int old_ruid, old_suid, new_suid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
 	if (retval)
-		return retval;
+		goto error;
 
-	old_ruid = cred->uid;
-	old_suid = cred->suid;
-	new_suid = old_suid;
-	
+	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
-		if (uid != old_ruid && set_user(uid, old_euid != uid) < 0)
-			return -EAGAIN;
-		new_suid = uid;
-	} else if ((uid != cred->uid) && (uid != new_suid))
-		return -EPERM;
-
-	if (old_euid != uid) {
-		set_dumpable(current->mm, suid_dumpable);
-		smp_wmb();
+		new->suid = new->uid = uid;
+		if (uid != old->uid && set_user(new) < 0) {
+			retval = -EAGAIN;
+			goto error;
+		}
+	} else if (uid != old->uid && uid != new->suid) {
+		goto error;
 	}
-	cred->fsuid = cred->euid = uid;
-	cred->suid = new_suid;
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	new->fsuid = new->euid = uid;
+
+	retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
+	if (retval < 0)
+		goto error;
+
+	return commit_creds(new);
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
+error:
+	abort_creds(new);
+	return retval;
 }
 
 
@@ -703,47 +706,53 @@ asmlinkage long sys_setuid(uid_t uid)
  */
 asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 {
-	struct cred *cred = current->cred;
-	int old_ruid = cred->uid;
-	int old_euid = cred->euid;
-	int old_suid = cred->suid;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
 	if (retval)
-		return retval;
+		goto error;
+	old = current_cred();
 
+	retval = -EPERM;
 	if (!capable(CAP_SETUID)) {
-		if ((ruid != (uid_t) -1) && (ruid != cred->uid) &&
-		    (ruid != cred->euid) && (ruid != cred->suid))
-			return -EPERM;
-		if ((euid != (uid_t) -1) && (euid != cred->uid) &&
-		    (euid != cred->euid) && (euid != cred->suid))
-			return -EPERM;
-		if ((suid != (uid_t) -1) && (suid != cred->uid) &&
-		    (suid != cred->euid) && (suid != cred->suid))
-			return -EPERM;
+		if (ruid != (uid_t) -1 && ruid != old->uid &&
+		    ruid != old->euid  && ruid != old->suid)
+			goto error;
+		if (euid != (uid_t) -1 && euid != old->uid &&
+		    euid != old->euid  && euid != old->suid)
+			goto error;
+		if (suid != (uid_t) -1 && suid != old->uid &&
+		    suid != old->euid  && suid != old->suid)
+			goto error;
 	}
+
+	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
-		if (ruid != cred->uid &&
-		    set_user(ruid, euid != cred->euid) < 0)
-			return -EAGAIN;
+		new->uid = ruid;
+		if (ruid != old->uid && set_user(new) < 0)
+			goto error;
 	}
-	if (euid != (uid_t) -1) {
-		if (euid != cred->euid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->euid = euid;
-	}
-	cred->fsuid = cred->euid;
+	if (euid != (uid_t) -1)
+		new->euid = euid;
 	if (suid != (uid_t) -1)
-		cred->suid = suid;
+		new->suid = suid;
+	new->fsuid = new->euid;
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
+	retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
+	if (retval < 0)
+		goto error;
 
-	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
@@ -763,40 +772,45 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
  */
 asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old;
+	struct cred *new;
 	int retval;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
+
 	retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
 	if (retval)
-		return retval;
+		goto error;
 
+	retval = -EPERM;
 	if (!capable(CAP_SETGID)) {
-		if ((rgid != (gid_t) -1) && (rgid != cred->gid) &&
-		    (rgid != cred->egid) && (rgid != cred->sgid))
-			return -EPERM;
-		if ((egid != (gid_t) -1) && (egid != cred->gid) &&
-		    (egid != cred->egid) && (egid != cred->sgid))
-			return -EPERM;
-		if ((sgid != (gid_t) -1) && (sgid != cred->gid) &&
-		    (sgid != cred->egid) && (sgid != cred->sgid))
-			return -EPERM;
+		if (rgid != (gid_t) -1 && rgid != old->gid &&
+		    rgid != old->egid  && rgid != old->sgid)
+			goto error;
+		if (egid != (gid_t) -1 && egid != old->gid &&
+		    egid != old->egid  && egid != old->sgid)
+			goto error;
+		if (sgid != (gid_t) -1 && sgid != old->gid &&
+		    sgid != old->egid  && sgid != old->sgid)
+			goto error;
 	}
-	if (egid != (gid_t) -1) {
-		if (egid != cred->egid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
-		}
-		cred->egid = egid;
-	}
-	cred->fsgid = cred->egid;
+
 	if (rgid != (gid_t) -1)
-		cred->gid = rgid;
+		new->gid = rgid;
+	if (egid != (gid_t) -1)
+		new->egid = egid;
 	if (sgid != (gid_t) -1)
-		cred->sgid = sgid;
+		new->sgid = sgid;
+	new->fsgid = new->egid;
 
-	key_fsgid_changed(current);
-	proc_id_connector(current, PROC_EVENT_GID);
-	return 0;
+	return commit_creds(new);
+
+error:
+	abort_creds(new);
+	return retval;
 }
 
 asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
@@ -820,28 +834,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
  */
 asmlinkage long sys_setfsuid(uid_t uid)
 {
-	struct cred *cred = current->cred;
-	int old_fsuid;
+	const struct cred *old;
+	struct cred *new;
+	uid_t old_fsuid;
+
+	new = prepare_creds();
+	if (!new)
+		return current_fsuid();
+	old = current_cred();
+	old_fsuid = old->fsuid;
 
-	old_fsuid = cred->fsuid;
-	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS))
-		return old_fsuid;
+	if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
+		goto error;
 
-	if (uid == cred->uid || uid == cred->euid ||
-	    uid == cred->suid || uid == cred->fsuid ||
+	if (uid == old->uid  || uid == old->euid  ||
+	    uid == old->suid || uid == old->fsuid ||
 	    capable(CAP_SETUID)) {
 		if (uid != old_fsuid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
+			new->fsuid = uid;
+			if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
+				goto change_okay;
 		}
-		cred->fsuid = uid;
 	}
 
-	key_fsuid_changed(current);
-	proc_id_connector(current, PROC_EVENT_UID);
-
-	security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
+error:
+	abort_creds(new);
+	return old_fsuid;
 
+change_okay:
+	commit_creds(new);
 	return old_fsuid;
 }
 
@@ -850,24 +871,34 @@ asmlinkage long sys_setfsuid(uid_t uid)
  */
 asmlinkage long sys_setfsgid(gid_t gid)
 {
-	struct cred *cred = current->cred;
-	int old_fsgid;
+	const struct cred *old;
+	struct cred *new;
+	gid_t old_fsgid;
+
+	new = prepare_creds();
+	if (!new)
+		return current_fsgid();
+	old = current_cred();
+	old_fsgid = old->fsgid;
 
-	old_fsgid = cred->fsgid;
 	if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
-		return old_fsgid;
+		goto error;
 
-	if (gid == cred->gid || gid == cred->egid ||
-	    gid == cred->sgid || gid == cred->fsgid ||
+	if (gid == old->gid  || gid == old->egid  ||
+	    gid == old->sgid || gid == old->fsgid ||
 	    capable(CAP_SETGID)) {
 		if (gid != old_fsgid) {
-			set_dumpable(current->mm, suid_dumpable);
-			smp_wmb();
+			new->fsgid = gid;
+			goto change_okay;
 		}
-		cred->fsgid = gid;
-		key_fsgid_changed(current);
-		proc_id_connector(current, PROC_EVENT_GID);
 	}
+
+error:
+	abort_creds(new);
+	return old_fsgid;
+
+change_okay:
+	commit_creds(new);
 	return old_fsgid;
 }
 
@@ -1136,7 +1167,7 @@ EXPORT_SYMBOL(groups_free);
 
 /* export the group_info to a user-space array */
 static int groups_to_user(gid_t __user *grouplist,
-    struct group_info *group_info)
+			  const struct group_info *group_info)
 {
 	int i;
 	unsigned int count = group_info->ngroups;
@@ -1227,31 +1258,25 @@ int groups_search(const struct group_info *group_info, gid_t grp)
 }
 
 /**
- * set_groups - Change a group subscription in a security record
- * @sec: The security record to alter
- * @group_info: The group list to impose
+ * set_groups - Change a group subscription in a set of credentials
+ * @new: The newly prepared set of credentials to alter
+ * @group_info: The group list to install
  *
- * Validate a group subscription and, if valid, impose it upon a task security
- * record.
+ * Validate a group subscription and, if valid, insert it into a set
+ * of credentials.
  */
-int set_groups(struct cred *cred, struct group_info *group_info)
+int set_groups(struct cred *new, struct group_info *group_info)
 {
 	int retval;
-	struct group_info *old_info;
 
 	retval = security_task_setgroups(group_info);
 	if (retval)
 		return retval;
 
+	put_group_info(new->group_info);
 	groups_sort(group_info);
 	get_group_info(group_info);
-
-	spin_lock(&cred->lock);
-	old_info = cred->group_info;
-	cred->group_info = group_info;
-	spin_unlock(&cred->lock);
-
-	put_group_info(old_info);
+	new->group_info = group_info;
 	return 0;
 }
 
@@ -1266,7 +1291,20 @@ EXPORT_SYMBOL(set_groups);
  */
 int set_current_groups(struct group_info *group_info)
 {
-	return set_groups(current->cred, group_info);
+	struct cred *new;
+	int ret;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = set_groups(new, group_info);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
+
+	return commit_creds(new);
 }
 
 EXPORT_SYMBOL(set_current_groups);
@@ -1666,9 +1704,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 	unsigned char comm[sizeof(me->comm)];
 	long error;
 
-	if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
+	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
+	if (error != -ENOSYS)
 		return error;
 
+	error = 0;
 	switch (option) {
 		case PR_SET_PDEATHSIG:
 			if (!valid_signal(arg2)) {
diff --git a/kernel/user.c b/kernel/user.c
index 104d22ac84d..d476307dd4b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
+#include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
 	.kref = {
@@ -104,16 +105,10 @@ static int sched_create_user(struct user_struct *up)
 	return rc;
 }
 
-static void sched_switch_user(struct task_struct *p)
-{
-	sched_move_task(p);
-}
-
 #else	/* CONFIG_USER_SCHED */
 
 static void sched_destroy_user(struct user_struct *up) { }
 static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
 
 #endif	/* CONFIG_USER_SCHED */
 
@@ -448,36 +443,6 @@ out_unlock:
 	return NULL;
 }
 
-void switch_uid(struct user_struct *new_user)
-{
-	struct user_struct *old_user;
-
-	/* What if a process setreuid()'s and this brings the
-	 * new uid over his NPROC rlimit?  We can check this now
-	 * cheaply with the new uid cache, so if it matters
-	 * we should be checking for it.  -DaveM
-	 */
-	old_user = current->cred->user;
-	atomic_inc(&new_user->processes);
-	atomic_dec(&old_user->processes);
-	switch_uid_keyring(new_user);
-	current->cred->user = new_user;
-	sched_switch_user(current);
-
-	/*
-	 * We need to synchronize with __sigqueue_alloc()
-	 * doing a get_uid(p->user).. If that saw the old
-	 * user value, we need to wait until it has exited
-	 * its critical region before we can free the old
-	 * structure.
-	 */
-	smp_mb();
-	spin_unlock_wait(&current->sighand->siglock);
-
-	free_uid(old_user);
-	suid_keys(current);
-}
-
 #ifdef CONFIG_USER_NS
 void release_uids(struct user_namespace *ns)
 {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f82730adea0..0d9c51d6733 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -19,6 +19,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 {
 	struct user_namespace *ns;
 	struct user_struct *new_user;
+	struct cred *new;
 	int n;
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
@@ -45,7 +46,16 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	switch_uid(new_user);
+	/* Install the new user */
+	new = prepare_creds();
+	if (!new) {
+		free_uid(new_user);
+		free_uid(ns->root_user);
+		kfree(ns);
+	}
+	free_uid(new->user);
+	new->user = new_user;
+	commit_creds(new);
 	return ns;
 }
 
diff --git a/lib/Makefile b/lib/Makefile
index 7cb65d85aeb..80fe8a3ec12 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o prio_heap.o ratelimit.o show_mem.o
+	 proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 9a8ff684da7..ad8c7a782da 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -287,6 +287,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 			      time_t expiry,
 			      u32 kvno)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	int ret;
 
@@ -297,7 +298,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn,
 
 	_enter("");
 
-	key = key_alloc(&key_type_rxrpc, "x", 0, 0, current, 0,
+	key = key_alloc(&key_type_rxrpc, "x", 0, 0, cred, 0,
 			KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(key)) {
 		_leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key));
@@ -340,10 +341,11 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key);
  */
 struct key *rxrpc_get_null_key(const char *keyname)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	int ret;
 
-	key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
+	key = key_alloc(&key_type_rxrpc, keyname, 0, 0, cred,
 			KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(key))
 		return key;
diff --git a/security/capability.c b/security/capability.c
index fac2f61b69a..efeb6d9e0e6 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -340,12 +340,16 @@ static int cap_task_create(unsigned long clone_flags)
 	return 0;
 }
 
-static int cap_cred_alloc_security(struct cred *cred)
+static void cap_cred_free(struct cred *cred)
+{
+}
+
+static int cap_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp)
 {
 	return 0;
 }
 
-static void cap_cred_free(struct cred *cred)
+static void cap_cred_commit(struct cred *new, const struct cred *old)
 {
 }
 
@@ -750,7 +754,7 @@ static void cap_release_secctx(char *secdata, u32 seclen)
 }
 
 #ifdef CONFIG_KEYS
-static int cap_key_alloc(struct key *key, struct task_struct *ctx,
+static int cap_key_alloc(struct key *key, const struct cred *cred,
 			 unsigned long flags)
 {
 	return 0;
@@ -760,7 +764,7 @@ static void cap_key_free(struct key *key)
 {
 }
 
-static int cap_key_permission(key_ref_t key_ref, struct task_struct *context,
+static int cap_key_permission(key_ref_t key_ref, const struct cred *cred,
 			      key_perm_t perm)
 {
 	return 0;
@@ -814,8 +818,7 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, ptrace_may_access);
 	set_to_cap_if_null(ops, ptrace_traceme);
 	set_to_cap_if_null(ops, capget);
-	set_to_cap_if_null(ops, capset_check);
-	set_to_cap_if_null(ops, capset_set);
+	set_to_cap_if_null(ops, capset);
 	set_to_cap_if_null(ops, acct);
 	set_to_cap_if_null(ops, capable);
 	set_to_cap_if_null(ops, quotactl);
@@ -890,10 +893,11 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, file_receive);
 	set_to_cap_if_null(ops, dentry_open);
 	set_to_cap_if_null(ops, task_create);
-	set_to_cap_if_null(ops, cred_alloc_security);
 	set_to_cap_if_null(ops, cred_free);
+	set_to_cap_if_null(ops, cred_prepare);
+	set_to_cap_if_null(ops, cred_commit);
 	set_to_cap_if_null(ops, task_setuid);
-	set_to_cap_if_null(ops, task_post_setuid);
+	set_to_cap_if_null(ops, task_fix_setuid);
 	set_to_cap_if_null(ops, task_setgid);
 	set_to_cap_if_null(ops, task_setpgid);
 	set_to_cap_if_null(ops, task_getpgid);
@@ -910,7 +914,6 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, task_wait);
 	set_to_cap_if_null(ops, task_kill);
 	set_to_cap_if_null(ops, task_prctl);
-	set_to_cap_if_null(ops, task_reparent_to_init);
 	set_to_cap_if_null(ops, task_to_inode);
 	set_to_cap_if_null(ops, ipc_permission);
 	set_to_cap_if_null(ops, ipc_getsecid);
diff --git a/security/commoncap.c b/security/commoncap.c
index 0384bf95db6..b5419273f92 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -72,8 +72,8 @@ int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
 	int ret = 0;
 
 	rcu_read_lock();
-	if (!cap_issubset(child->cred->cap_permitted,
-			  current->cred->cap_permitted) &&
+	if (!cap_issubset(__task_cred(child)->cap_permitted,
+			  current_cred()->cap_permitted) &&
 	    !capable(CAP_SYS_PTRACE))
 		ret = -EPERM;
 	rcu_read_unlock();
@@ -85,8 +85,8 @@ int cap_ptrace_traceme(struct task_struct *parent)
 	int ret = 0;
 
 	rcu_read_lock();
-	if (!cap_issubset(current->cred->cap_permitted,
-			 parent->cred->cap_permitted) &&
+	if (!cap_issubset(current_cred()->cap_permitted,
+			  __task_cred(parent)->cap_permitted) &&
 	    !has_capability(parent, CAP_SYS_PTRACE))
 		ret = -EPERM;
 	rcu_read_unlock();
@@ -117,7 +117,7 @@ static inline int cap_inh_is_capped(void)
 	 * to the old permitted set. That is, if the current task
 	 * does *not* possess the CAP_SETPCAP capability.
 	 */
-	return (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0);
+	return cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0;
 }
 
 static inline int cap_limit_ptraced_target(void) { return 1; }
@@ -132,52 +132,39 @@ static inline int cap_limit_ptraced_target(void)
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
 
-int cap_capset_check(const kernel_cap_t *effective,
-		     const kernel_cap_t *inheritable,
-		     const kernel_cap_t *permitted)
+int cap_capset(struct cred *new,
+	       const struct cred *old,
+	       const kernel_cap_t *effective,
+	       const kernel_cap_t *inheritable,
+	       const kernel_cap_t *permitted)
 {
-	const struct cred *cred = current->cred;
-
-	if (cap_inh_is_capped()
-	    && !cap_issubset(*inheritable,
-			     cap_combine(cred->cap_inheritable,
-					 cred->cap_permitted))) {
+	if (cap_inh_is_capped() &&
+	    !cap_issubset(*inheritable,
+			  cap_combine(old->cap_inheritable,
+				      old->cap_permitted)))
 		/* incapable of using this inheritable set */
 		return -EPERM;
-	}
+
 	if (!cap_issubset(*inheritable,
-			   cap_combine(cred->cap_inheritable,
-				       cred->cap_bset))) {
+			  cap_combine(old->cap_inheritable,
+				      old->cap_bset)))
 		/* no new pI capabilities outside bounding set */
 		return -EPERM;
-	}
 
 	/* verify restrictions on target's new Permitted set */
-	if (!cap_issubset (*permitted,
-			   cap_combine (cred->cap_permitted,
-					cred->cap_permitted))) {
+	if (!cap_issubset(*permitted, old->cap_permitted))
 		return -EPERM;
-	}
 
 	/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
-	if (!cap_issubset (*effective, *permitted)) {
+	if (!cap_issubset(*effective, *permitted))
 		return -EPERM;
-	}
 
+	new->cap_effective   = *effective;
+	new->cap_inheritable = *inheritable;
+	new->cap_permitted   = *permitted;
 	return 0;
 }
 
-void cap_capset_set(const kernel_cap_t *effective,
-		    const kernel_cap_t *inheritable,
-		    const kernel_cap_t *permitted)
-{
-	struct cred *cred = current->cred;
-
-	cred->cap_effective   = *effective;
-	cred->cap_inheritable = *inheritable;
-	cred->cap_permitted   = *permitted;
-}
-
 static inline void bprm_clear_caps(struct linux_binprm *bprm)
 {
 	cap_clear(bprm->cap_post_exec_permitted);
@@ -382,41 +369,46 @@ int cap_bprm_set_security (struct linux_binprm *bprm)
 	return ret;
 }
 
-void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
+int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
-	struct cred *cred = current->cred;
+	const struct cred *old = current_cred();
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-	if (bprm->e_uid != cred->uid || bprm->e_gid != cred->gid ||
+	if (bprm->e_uid != old->uid || bprm->e_gid != old->gid ||
 	    !cap_issubset(bprm->cap_post_exec_permitted,
-			  cred->cap_permitted)) {
+			  old->cap_permitted)) {
 		set_dumpable(current->mm, suid_dumpable);
 		current->pdeath_signal = 0;
 
 		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 			if (!capable(CAP_SETUID)) {
-				bprm->e_uid = cred->uid;
-				bprm->e_gid = cred->gid;
+				bprm->e_uid = old->uid;
+				bprm->e_gid = old->gid;
 			}
 			if (cap_limit_ptraced_target()) {
 				bprm->cap_post_exec_permitted = cap_intersect(
 					bprm->cap_post_exec_permitted,
-					cred->cap_permitted);
+					new->cap_permitted);
 			}
 		}
 	}
 
-	cred->suid = cred->euid = cred->fsuid = bprm->e_uid;
-	cred->sgid = cred->egid = cred->fsgid = bprm->e_gid;
+	new->suid = new->euid = new->fsuid = bprm->e_uid;
+	new->sgid = new->egid = new->fsgid = bprm->e_gid;
 
 	/* For init, we want to retain the capabilities set
 	 * in the init_task struct. Thus we skip the usual
 	 * capability rules */
 	if (!is_global_init(current)) {
-		cred->cap_permitted = bprm->cap_post_exec_permitted;
+		new->cap_permitted = bprm->cap_post_exec_permitted;
 		if (bprm->cap_effective)
-			cred->cap_effective = bprm->cap_post_exec_permitted;
+			new->cap_effective = bprm->cap_post_exec_permitted;
 		else
-			cap_clear(cred->cap_effective);
+			cap_clear(new->cap_effective);
 	}
 
 	/*
@@ -431,15 +423,15 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	 * Number 1 above might fail if you don't have a full bset, but I think
 	 * that is interesting information to audit.
 	 */
-	if (!cap_isclear(cred->cap_effective)) {
-		if (!cap_issubset(CAP_FULL_SET, cred->cap_effective) ||
-		    (bprm->e_uid != 0) || (cred->uid != 0) ||
+	if (!cap_isclear(new->cap_effective)) {
+		if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
+		    bprm->e_uid != 0 || new->uid != 0 ||
 		    issecure(SECURE_NOROOT))
-			audit_log_bprm_fcaps(bprm, &cred->cap_permitted,
-					     &cred->cap_effective);
+			audit_log_bprm_fcaps(bprm, new, old);
 	}
 
-	cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+	return commit_creds(new);
 }
 
 int cap_bprm_secureexec (struct linux_binprm *bprm)
@@ -514,65 +506,49 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
  * files..
  * Thanks to Olaf Kirch and Peter Benie for spotting this.
  */
-static inline void cap_emulate_setxuid (int old_ruid, int old_euid,
-					int old_suid)
+static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
 {
-	struct cred *cred = current->cred;
-
-	if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
-	    (cred->uid != 0 && cred->euid != 0 && cred->suid != 0) &&
+	if ((old->uid == 0 || old->euid == 0 || old->suid == 0) &&
+	    (new->uid != 0 && new->euid != 0 && new->suid != 0) &&
 	    !issecure(SECURE_KEEP_CAPS)) {
-		cap_clear(cred->cap_permitted);
-		cap_clear(cred->cap_effective);
-	}
-	if (old_euid == 0 && cred->euid != 0) {
-		cap_clear(cred->cap_effective);
-	}
-	if (old_euid != 0 && cred->euid == 0) {
-		cred->cap_effective = cred->cap_permitted;
+		cap_clear(new->cap_permitted);
+		cap_clear(new->cap_effective);
 	}
+	if (old->euid == 0 && new->euid != 0)
+		cap_clear(new->cap_effective);
+	if (old->euid != 0 && new->euid == 0)
+		new->cap_effective = new->cap_permitted;
 }
 
-int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
-			  int flags)
+int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
 {
-	struct cred *cred = current->cred;
-
 	switch (flags) {
 	case LSM_SETID_RE:
 	case LSM_SETID_ID:
 	case LSM_SETID_RES:
 		/* Copied from kernel/sys.c:setreuid/setuid/setresuid. */
-		if (!issecure (SECURE_NO_SETUID_FIXUP)) {
-			cap_emulate_setxuid (old_ruid, old_euid, old_suid);
-		}
+		if (!issecure(SECURE_NO_SETUID_FIXUP))
+			cap_emulate_setxuid(new, old);
 		break;
 	case LSM_SETID_FS:
-		{
-			uid_t old_fsuid = old_ruid;
-
-			/* Copied from kernel/sys.c:setfsuid. */
+		/* Copied from kernel/sys.c:setfsuid. */
 
-			/*
-			 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
-			 *          if not, we might be a bit too harsh here.
-			 */
-
-			if (!issecure (SECURE_NO_SETUID_FIXUP)) {
-				if (old_fsuid == 0 && cred->fsuid != 0) {
-					cred->cap_effective =
-						cap_drop_fs_set(
-							cred->cap_effective);
-				}
-				if (old_fsuid != 0 && cred->fsuid == 0) {
-					cred->cap_effective =
-						cap_raise_fs_set(
-						    cred->cap_effective,
-						    cred->cap_permitted);
-				}
+		/*
+		 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
+		 *          if not, we might be a bit too harsh here.
+		 */
+		if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+			if (old->fsuid == 0 && new->fsuid != 0) {
+				new->cap_effective =
+					cap_drop_fs_set(new->cap_effective);
+			}
+			if (old->fsuid != 0 && new->fsuid == 0) {
+				new->cap_effective =
+					cap_raise_fs_set(new->cap_effective,
+							 new->cap_permitted);
 			}
-			break;
 		}
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -628,13 +604,14 @@ int cap_task_setnice (struct task_struct *p, int nice)
  * this task could get inconsistent info.  There can be no
  * racing writer bc a task can only change its own caps.
  */
-static long cap_prctl_drop(unsigned long cap)
+static long cap_prctl_drop(struct cred *new, unsigned long cap)
 {
 	if (!capable(CAP_SETPCAP))
 		return -EPERM;
 	if (!cap_valid(cap))
 		return -EINVAL;
-	cap_lower(current->cred->cap_bset, cap);
+
+	cap_lower(new->cap_bset, cap);
 	return 0;
 }
 
@@ -655,22 +632,29 @@ int cap_task_setnice (struct task_struct *p, int nice)
 #endif
 
 int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-		   unsigned long arg4, unsigned long arg5, long *rc_p)
+		   unsigned long arg4, unsigned long arg5)
 {
-	struct cred *cred = current_cred();
+	struct cred *new;
 	long error = 0;
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	switch (option) {
 	case PR_CAPBSET_READ:
+		error = -EINVAL;
 		if (!cap_valid(arg2))
-			error = -EINVAL;
-		else
-			error = !!cap_raised(cred->cap_bset, arg2);
-		break;
+			goto error;
+		error = !!cap_raised(new->cap_bset, arg2);
+		goto no_change;
+
 #ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 	case PR_CAPBSET_DROP:
-		error = cap_prctl_drop(arg2);
-		break;
+		error = cap_prctl_drop(new, arg2);
+		if (error < 0)
+			goto error;
+		goto changed;
 
 	/*
 	 * The next four prctl's remain to assist with transitioning a
@@ -692,12 +676,12 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 	 * capability-based-privilege environment.
 	 */
 	case PR_SET_SECUREBITS:
-		if ((((cred->securebits & SECURE_ALL_LOCKS) >> 1)
-		     & (cred->securebits ^ arg2))                  /*[1]*/
-		    || ((cred->securebits & SECURE_ALL_LOCKS
-			 & ~arg2))                                    /*[2]*/
-		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/
-		    || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0)) { /*[4]*/
+		error = -EPERM;
+		if ((((new->securebits & SECURE_ALL_LOCKS) >> 1)
+		     & (new->securebits ^ arg2))			/*[1]*/
+		    || ((new->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
+		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
+		    || (cap_capable(current, CAP_SETPCAP, SECURITY_CAP_AUDIT) != 0) /*[4]*/
 			/*
 			 * [1] no changing of bits that are locked
 			 * [2] no unlocking of locks
@@ -705,50 +689,51 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 * [4] doing anything requires privilege (go read about
 			 *     the "sendmail capabilities bug")
 			 */
-			error = -EPERM;  /* cannot change a locked bit */
-		} else {
-			cred->securebits = arg2;
-		}
-		break;
+		    )
+			/* cannot change a locked bit */
+			goto error;
+		new->securebits = arg2;
+		goto changed;
+
 	case PR_GET_SECUREBITS:
-		error = cred->securebits;
-		break;
+		error = new->securebits;
+		goto no_change;
 
 #endif /* def CONFIG_SECURITY_FILE_CAPABILITIES */
 
 	case PR_GET_KEEPCAPS:
 		if (issecure(SECURE_KEEP_CAPS))
 			error = 1;
-		break;
+		goto no_change;
+
 	case PR_SET_KEEPCAPS:
+		error = -EINVAL;
 		if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
-			error = -EINVAL;
-		else if (issecure(SECURE_KEEP_CAPS_LOCKED))
-			error = -EPERM;
-		else if (arg2)
-			cred->securebits |= issecure_mask(SECURE_KEEP_CAPS);
+			goto error;
+		error = -EPERM;
+		if (issecure(SECURE_KEEP_CAPS_LOCKED))
+			goto error;
+		if (arg2)
+			new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
 		else
-			cred->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
-		break;
+			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
+		goto changed;
 
 	default:
 		/* No functionality available - continue with default */
-		return 0;
+		error = -ENOSYS;
+		goto error;
 	}
 
 	/* Functionality provided */
-	*rc_p = error;
-	return 1;
-}
-
-void cap_task_reparent_to_init (struct task_struct *p)
-{
-	struct cred *cred = p->cred;
-
-	cap_set_init_eff(cred->cap_effective);
-	cap_clear(cred->cap_inheritable);
-	cap_set_full(cred->cap_permitted);
-	p->cred->securebits = SECUREBITS_DEFAULT;
+changed:
+	return commit_creds(new);
+
+no_change:
+	error = 0;
+error:
+	abort_creds(new);
+	return error;
 }
 
 int cap_syslog (int type)
diff --git a/security/keys/internal.h b/security/keys/internal.h
index d1586c62978..81932abefe7 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -12,6 +12,7 @@
 #ifndef _INTERNAL_H
 #define _INTERNAL_H
 
+#include <linux/sched.h>
 #include <linux/key-type.h>
 
 static inline __attribute__((format(printf, 1, 2)))
@@ -25,7 +26,7 @@ void no_printk(const char *fmt, ...)
 #define kleave(FMT, ...) \
 	printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) \
-	printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
+	printk(KERN_DEBUG "   "FMT"\n", ##__VA_ARGS__)
 #else
 #define kenter(FMT, ...) \
 	no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -97,7 +98,7 @@ extern struct key *keyring_search_instkey(struct key *keyring,
 typedef int (*key_match_func_t)(const struct key *, const void *);
 
 extern key_ref_t keyring_search_aux(key_ref_t keyring_ref,
-				    struct task_struct *tsk,
+				    const struct cred *cred,
 				    struct key_type *type,
 				    const void *description,
 				    key_match_func_t match);
@@ -105,13 +106,13 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref,
 extern key_ref_t search_process_keyrings(struct key_type *type,
 					 const void *description,
 					 key_match_func_t match,
-					 struct task_struct *tsk);
+					 const struct cred *cred);
 
 extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check);
 
 extern int install_user_keyrings(void);
-extern int install_thread_keyring(void);
-extern int install_process_keyring(void);
+extern int install_thread_keyring_to_cred(struct cred *);
+extern int install_process_keyring_to_cred(struct cred *);
 
 extern struct key *request_key_and_link(struct key_type *type,
 					const char *description,
@@ -130,12 +131,12 @@ extern long join_session_keyring(const char *name);
  * check to see whether permission is granted to use a key in the desired way
  */
 extern int key_task_permission(const key_ref_t key_ref,
-			       struct task_struct *context,
+			       const struct cred *cred,
 			       key_perm_t perm);
 
 static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
 {
-	return key_task_permission(key_ref, current, perm);
+	return key_task_permission(key_ref, current_cred(), perm);
 }
 
 /* required permissions */
@@ -153,7 +154,7 @@ static inline int key_permission(const key_ref_t key_ref, key_perm_t perm)
 struct request_key_auth {
 	struct key		*target_key;
 	struct key		*dest_keyring;
-	struct task_struct	*context;
+	const struct cred	*cred;
 	void			*callout_info;
 	size_t			callout_len;
 	pid_t			pid;
diff --git a/security/keys/key.c b/security/keys/key.c
index a6ca39ed3b0..f76c8a546fd 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -218,7 +218,7 @@ serial_exists:
  *   instantiate the key or discard it before returning
  */
 struct key *key_alloc(struct key_type *type, const char *desc,
-		      uid_t uid, gid_t gid, struct task_struct *ctx,
+		      uid_t uid, gid_t gid, const struct cred *cred,
 		      key_perm_t perm, unsigned long flags)
 {
 	struct key_user *user = NULL;
@@ -294,7 +294,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 #endif
 
 	/* let the security module know about the key */
-	ret = security_key_alloc(key, ctx, flags);
+	ret = security_key_alloc(key, cred, flags);
 	if (ret < 0)
 		goto security_error;
 
@@ -391,7 +391,7 @@ static int __key_instantiate_and_link(struct key *key,
 				      const void *data,
 				      size_t datalen,
 				      struct key *keyring,
-				      struct key *instkey)
+				      struct key *authkey)
 {
 	int ret, awaken;
 
@@ -421,8 +421,8 @@ static int __key_instantiate_and_link(struct key *key,
 				ret = __key_link(keyring, key);
 
 			/* disable the authorisation key */
-			if (instkey)
-				key_revoke(instkey);
+			if (authkey)
+				key_revoke(authkey);
 		}
 	}
 
@@ -444,14 +444,14 @@ int key_instantiate_and_link(struct key *key,
 			     const void *data,
 			     size_t datalen,
 			     struct key *keyring,
-			     struct key *instkey)
+			     struct key *authkey)
 {
 	int ret;
 
 	if (keyring)
 		down_write(&keyring->sem);
 
-	ret = __key_instantiate_and_link(key, data, datalen, keyring, instkey);
+	ret = __key_instantiate_and_link(key, data, datalen, keyring, authkey);
 
 	if (keyring)
 		up_write(&keyring->sem);
@@ -469,7 +469,7 @@ EXPORT_SYMBOL(key_instantiate_and_link);
 int key_negate_and_link(struct key *key,
 			unsigned timeout,
 			struct key *keyring,
-			struct key *instkey)
+			struct key *authkey)
 {
 	struct timespec now;
 	int ret, awaken;
@@ -504,8 +504,8 @@ int key_negate_and_link(struct key *key,
 			ret = __key_link(keyring, key);
 
 		/* disable the authorisation key */
-		if (instkey)
-			key_revoke(instkey);
+		if (authkey)
+			key_revoke(authkey);
 	}
 
 	mutex_unlock(&key_construction_mutex);
@@ -743,6 +743,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			       key_perm_t perm,
 			       unsigned long flags)
 {
+	const struct cred *cred = current_cred();
 	struct key_type *ktype;
 	struct key *keyring, *key = NULL;
 	key_ref_t key_ref;
@@ -802,8 +803,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	}
 
 	/* allocate a new key */
-	key = key_alloc(ktype, description, current_fsuid(), current_fsgid(),
-			current, perm, flags);
+	key = key_alloc(ktype, description, cred->fsuid, cred->fsgid, cred,
+			perm, flags);
 	if (IS_ERR(key)) {
 		key_ref = ERR_CAST(key);
 		goto error_3;
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 8833b447ade..7c72baa02f2 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -866,6 +866,23 @@ static long get_instantiation_keyring(key_serial_t ringid,
 	return -ENOKEY;
 }
 
+/*
+ * change the request_key authorisation key on the current process
+ */
+static int keyctl_change_reqkey_auth(struct key *key)
+{
+	struct cred *new;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	key_put(new->request_key_auth);
+	new->request_key_auth = key_get(key);
+
+	return commit_creds(new);
+}
+
 /*****************************************************************************/
 /*
  * instantiate the key with the specified payload, and, if one is given, link
@@ -876,12 +893,15 @@ long keyctl_instantiate_key(key_serial_t id,
 			    size_t plen,
 			    key_serial_t ringid)
 {
+	const struct cred *cred = current_cred();
 	struct request_key_auth *rka;
 	struct key *instkey, *dest_keyring;
 	void *payload;
 	long ret;
 	bool vm = false;
 
+	kenter("%d,,%zu,%d", id, plen, ringid);
+
 	ret = -EINVAL;
 	if (plen > 1024 * 1024 - 1)
 		goto error;
@@ -889,7 +909,7 @@ long keyctl_instantiate_key(key_serial_t id,
 	/* the appropriate instantiation authorisation key must have been
 	 * assumed before calling this */
 	ret = -EPERM;
-	instkey = current->cred->request_key_auth;
+	instkey = cred->request_key_auth;
 	if (!instkey)
 		goto error;
 
@@ -931,10 +951,8 @@ long keyctl_instantiate_key(key_serial_t id,
 
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
-	if (ret == 0) {
-		key_put(current->cred->request_key_auth);
-		current->cred->request_key_auth = NULL;
-	}
+	if (ret == 0)
+		keyctl_change_reqkey_auth(NULL);
 
 error2:
 	if (!vm)
@@ -953,14 +971,17 @@ error:
  */
 long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 {
+	const struct cred *cred = current_cred();
 	struct request_key_auth *rka;
 	struct key *instkey, *dest_keyring;
 	long ret;
 
+	kenter("%d,%u,%d", id, timeout, ringid);
+
 	/* the appropriate instantiation authorisation key must have been
 	 * assumed before calling this */
 	ret = -EPERM;
-	instkey = current->cred->request_key_auth;
+	instkey = cred->request_key_auth;
 	if (!instkey)
 		goto error;
 
@@ -982,10 +1003,8 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 
 	/* discard the assumed authority if it's just been disabled by
 	 * instantiation of the key */
-	if (ret == 0) {
-		key_put(current->cred->request_key_auth);
-		current->cred->request_key_auth = NULL;
-	}
+	if (ret == 0)
+		keyctl_change_reqkey_auth(NULL);
 
 error:
 	return ret;
@@ -999,36 +1018,56 @@ error:
  */
 long keyctl_set_reqkey_keyring(int reqkey_defl)
 {
-	struct cred *cred = current->cred;
-	int ret;
+	struct cred *new;
+	int ret, old_setting;
+
+	old_setting = current_cred_xxx(jit_keyring);
+
+	if (reqkey_defl == KEY_REQKEY_DEFL_NO_CHANGE)
+		return old_setting;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
 	switch (reqkey_defl) {
 	case KEY_REQKEY_DEFL_THREAD_KEYRING:
-		ret = install_thread_keyring();
+		ret = install_thread_keyring_to_cred(new);
 		if (ret < 0)
-			return ret;
+			goto error;
 		goto set;
 
 	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
-		ret = install_process_keyring();
-		if (ret < 0)
-			return ret;
+		ret = install_process_keyring_to_cred(new);
+		if (ret < 0) {
+			if (ret != -EEXIST)
+				goto error;
+			ret = 0;
+		}
+		goto set;
 
 	case KEY_REQKEY_DEFL_DEFAULT:
 	case KEY_REQKEY_DEFL_SESSION_KEYRING:
 	case KEY_REQKEY_DEFL_USER_KEYRING:
 	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
-	set:
-		cred->jit_keyring = reqkey_defl;
+	case KEY_REQKEY_DEFL_REQUESTOR_KEYRING:
+		goto set;
 
 	case KEY_REQKEY_DEFL_NO_CHANGE:
-		return cred->jit_keyring;
-
 	case KEY_REQKEY_DEFL_GROUP_KEYRING:
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
+		goto error;
 	}
 
+set:
+	new->jit_keyring = reqkey_defl;
+	commit_creds(new);
+	return old_setting;
+error:
+	abort_creds(new);
+	return -EINVAL;
+
 } /* end keyctl_set_reqkey_keyring() */
 
 /*****************************************************************************/
@@ -1087,9 +1126,7 @@ long keyctl_assume_authority(key_serial_t id)
 
 	/* we divest ourselves of authority if given an ID of 0 */
 	if (id == 0) {
-		key_put(current->cred->request_key_auth);
-		current->cred->request_key_auth = NULL;
-		ret = 0;
+		ret = keyctl_change_reqkey_auth(NULL);
 		goto error;
 	}
 
@@ -1104,10 +1141,12 @@ long keyctl_assume_authority(key_serial_t id)
 		goto error;
 	}
 
-	key_put(current->cred->request_key_auth);
-	current->cred->request_key_auth = authkey;
-	ret = authkey->serial;
+	ret = keyctl_change_reqkey_auth(authkey);
+	if (ret < 0)
+		goto error;
+	key_put(authkey);
 
+	ret = authkey->serial;
 error:
 	return ret;
 
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index fdf75f90199..ed851574d07 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -245,14 +245,14 @@ static long keyring_read(const struct key *keyring,
  * allocate a keyring and link into the destination keyring
  */
 struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
-			  struct task_struct *ctx, unsigned long flags,
+			  const struct cred *cred, unsigned long flags,
 			  struct key *dest)
 {
 	struct key *keyring;
 	int ret;
 
 	keyring = key_alloc(&key_type_keyring, description,
-			    uid, gid, ctx,
+			    uid, gid, cred,
 			    (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
 			    flags);
 
@@ -281,7 +281,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
  * - we propagate the possession attribute from the keyring ref to the key ref
  */
 key_ref_t keyring_search_aux(key_ref_t keyring_ref,
-			     struct task_struct *context,
+			     const struct cred *cred,
 			     struct key_type *type,
 			     const void *description,
 			     key_match_func_t match)
@@ -304,7 +304,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref,
 	key_check(keyring);
 
 	/* top keyring must have search permission to begin the search */
-        err = key_task_permission(keyring_ref, context, KEY_SEARCH);
+        err = key_task_permission(keyring_ref, cred, KEY_SEARCH);
 	if (err < 0) {
 		key_ref = ERR_PTR(err);
 		goto error;
@@ -377,7 +377,7 @@ descend:
 
 		/* key must have search permissions */
 		if (key_task_permission(make_key_ref(key, possessed),
-					context, KEY_SEARCH) < 0)
+					cred, KEY_SEARCH) < 0)
 			continue;
 
 		/* we set a different error code if we pass a negative key */
@@ -404,7 +404,7 @@ ascend:
 			continue;
 
 		if (key_task_permission(make_key_ref(key, possessed),
-					context, KEY_SEARCH) < 0)
+					cred, KEY_SEARCH) < 0)
 			continue;
 
 		/* stack the current position */
@@ -459,7 +459,7 @@ key_ref_t keyring_search(key_ref_t keyring,
 	if (!type->match)
 		return ERR_PTR(-ENOKEY);
 
-	return keyring_search_aux(keyring, current,
+	return keyring_search_aux(keyring, current->cred,
 				  type, description, type->match);
 
 } /* end keyring_search() */
diff --git a/security/keys/permission.c b/security/keys/permission.c
index 13c36164f28..5d9fc7b93f2 100644
--- a/security/keys/permission.c
+++ b/security/keys/permission.c
@@ -14,24 +14,27 @@
 #include "internal.h"
 
 /*****************************************************************************/
-/*
- * check to see whether permission is granted to use a key in the desired way,
- * but permit the security modules to override
+/**
+ * key_task_permission - Check a key can be used
+ * @key_ref: The key to check
+ * @cred: The credentials to use
+ * @perm: The permissions to check for
+ *
+ * Check to see whether permission is granted to use a key in the desired way,
+ * but permit the security modules to override.
+ *
+ * The caller must hold either a ref on cred or must hold the RCU readlock or a
+ * spinlock.
  */
-int key_task_permission(const key_ref_t key_ref,
-			struct task_struct *context,
+int key_task_permission(const key_ref_t key_ref, const struct cred *cred,
 			key_perm_t perm)
 {
-	const struct cred *cred;
 	struct key *key;
 	key_perm_t kperm;
 	int ret;
 
 	key = key_ref_to_ptr(key_ref);
 
-	rcu_read_lock();
-	cred = __task_cred(context);
-
 	/* use the second 8-bits of permissions for keys the caller owns */
 	if (key->uid == cred->fsuid) {
 		kperm = key->perm >> 16;
@@ -57,7 +60,6 @@ int key_task_permission(const key_ref_t key_ref,
 	kperm = key->perm;
 
 use_these_perms:
-	rcu_read_lock();
 
 	/* use the top 8-bits of permissions for keys the caller possesses
 	 * - possessor permissions are additive with other permissions
@@ -71,7 +73,7 @@ use_these_perms:
 		return -EACCES;
 
 	/* let LSM be the final arbiter */
-	return security_key_permission(key_ref, context, perm);
+	return security_key_permission(key_ref, cred, perm);
 
 } /* end key_task_permission() */
 
diff --git a/security/keys/proc.c b/security/keys/proc.c
index f619170da76..7f508def50e 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -136,8 +136,12 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	int rc;
 
 	/* check whether the current task is allowed to view the key (assuming
-	 * non-possession) */
-	rc = key_task_permission(make_key_ref(key, 0), current, KEY_VIEW);
+	 * non-possession)
+	 * - the caller holds a spinlock, and thus the RCU read lock, making our
+	 *   access to __current_cred() safe
+	 */
+	rc = key_task_permission(make_key_ref(key, 0), current_cred(),
+				 KEY_VIEW);
 	if (rc < 0)
 		return 0;
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 70ee93406f3..df329f684a6 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -42,11 +42,15 @@ struct key_user root_key_user = {
  */
 int install_user_keyrings(void)
 {
-	struct user_struct *user = current->cred->user;
+	struct user_struct *user;
+	const struct cred *cred;
 	struct key *uid_keyring, *session_keyring;
 	char buf[20];
 	int ret;
 
+	cred = current_cred();
+	user = cred->user;
+
 	kenter("%p{%u}", user, user->uid);
 
 	if (user->uid_keyring) {
@@ -67,7 +71,7 @@ int install_user_keyrings(void)
 		uid_keyring = find_keyring_by_name(buf, true);
 		if (IS_ERR(uid_keyring)) {
 			uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1,
-						    current, KEY_ALLOC_IN_QUOTA,
+						    cred, KEY_ALLOC_IN_QUOTA,
 						    NULL);
 			if (IS_ERR(uid_keyring)) {
 				ret = PTR_ERR(uid_keyring);
@@ -83,8 +87,7 @@ int install_user_keyrings(void)
 		if (IS_ERR(session_keyring)) {
 			session_keyring =
 				keyring_alloc(buf, user->uid, (gid_t) -1,
-					      current, KEY_ALLOC_IN_QUOTA,
-					      NULL);
+					      cred, KEY_ALLOC_IN_QUOTA, NULL);
 			if (IS_ERR(session_keyring)) {
 				ret = PTR_ERR(session_keyring);
 				goto error_release;
@@ -116,142 +119,128 @@ error:
 	return ret;
 }
 
-/*****************************************************************************/
 /*
- * deal with the UID changing
+ * install a fresh thread keyring directly to new credentials
  */
-void switch_uid_keyring(struct user_struct *new_user)
+int install_thread_keyring_to_cred(struct cred *new)
 {
-#if 0 /* do nothing for now */
-	struct key *old;
-
-	/* switch to the new user's session keyring if we were running under
-	 * root's default session keyring */
-	if (new_user->uid != 0 &&
-	    current->session_keyring == &root_session_keyring
-	    ) {
-		atomic_inc(&new_user->session_keyring->usage);
-
-		task_lock(current);
-		old = current->session_keyring;
-		current->session_keyring = new_user->session_keyring;
-		task_unlock(current);
+	struct key *keyring;
 
-		key_put(old);
-	}
-#endif
+	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
+				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
 
-} /* end switch_uid_keyring() */
+	new->thread_keyring = keyring;
+	return 0;
+}
 
-/*****************************************************************************/
 /*
  * install a fresh thread keyring, discarding the old one
  */
-int install_thread_keyring(void)
+static int install_thread_keyring(void)
 {
-	struct task_struct *tsk = current;
-	struct key *keyring, *old;
-	char buf[20];
+	struct cred *new;
 	int ret;
 
-	sprintf(buf, "_tid.%u", tsk->pid);
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-	keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
-				KEY_ALLOC_QUOTA_OVERRUN, NULL);
-	if (IS_ERR(keyring)) {
-		ret = PTR_ERR(keyring);
-		goto error;
+	BUG_ON(new->thread_keyring);
+
+	ret = install_thread_keyring_to_cred(new);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
 	}
 
-	task_lock(tsk);
-	old = tsk->cred->thread_keyring;
-	tsk->cred->thread_keyring = keyring;
-	task_unlock(tsk);
+	return commit_creds(new);
+}
 
-	ret = 0;
+/*
+ * install a process keyring directly to a credentials struct
+ * - returns -EEXIST if there was already a process keyring, 0 if one installed,
+ *   and other -ve on any other error
+ */
+int install_process_keyring_to_cred(struct cred *new)
+{
+	struct key *keyring;
+	int ret;
 
-	key_put(old);
-error:
+	if (new->tgcred->process_keyring)
+		return -EEXIST;
+
+	keyring = keyring_alloc("_pid", new->uid, new->gid,
+				new, KEY_ALLOC_QUOTA_OVERRUN, NULL);
+	if (IS_ERR(keyring))
+		return PTR_ERR(keyring);
+
+	spin_lock_irq(&new->tgcred->lock);
+	if (!new->tgcred->process_keyring) {
+		new->tgcred->process_keyring = keyring;
+		keyring = NULL;
+		ret = 0;
+	} else {
+		ret = -EEXIST;
+	}
+	spin_unlock_irq(&new->tgcred->lock);
+	key_put(keyring);
 	return ret;
+}
 
-} /* end install_thread_keyring() */
-
-/*****************************************************************************/
 /*
  * make sure a process keyring is installed
+ * - we
  */
-int install_process_keyring(void)
+static int install_process_keyring(void)
 {
-	struct task_struct *tsk = current;
-	struct key *keyring;
-	char buf[20];
+	struct cred *new;
 	int ret;
 
-	might_sleep();
-
-	if (!tsk->cred->tgcred->process_keyring) {
-		sprintf(buf, "_pid.%u", tsk->tgid);
-
-		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid, tsk,
-					KEY_ALLOC_QUOTA_OVERRUN, NULL);
-		if (IS_ERR(keyring)) {
-			ret = PTR_ERR(keyring);
-			goto error;
-		}
-
-		/* attach keyring */
-		spin_lock_irq(&tsk->cred->tgcred->lock);
-		if (!tsk->cred->tgcred->process_keyring) {
-			tsk->cred->tgcred->process_keyring = keyring;
-			keyring = NULL;
-		}
-		spin_unlock_irq(&tsk->cred->tgcred->lock);
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-		key_put(keyring);
+	ret = install_process_keyring_to_cred(new);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret != -EEXIST ?: 0;
 	}
 
-	ret = 0;
-error:
-	return ret;
-
-} /* end install_process_keyring() */
+	return commit_creds(new);
+}
 
-/*****************************************************************************/
 /*
- * install a session keyring, discarding the old one
- * - if a keyring is not supplied, an empty one is invented
+ * install a session keyring directly to a credentials struct
  */
-static int install_session_keyring(struct key *keyring)
+static int install_session_keyring_to_cred(struct cred *cred,
+					   struct key *keyring)
 {
-	struct task_struct *tsk = current;
 	unsigned long flags;
 	struct key *old;
-	char buf[20];
 
 	might_sleep();
 
 	/* create an empty session keyring */
 	if (!keyring) {
-		sprintf(buf, "_ses.%u", tsk->tgid);
-
 		flags = KEY_ALLOC_QUOTA_OVERRUN;
-		if (tsk->cred->tgcred->session_keyring)
+		if (cred->tgcred->session_keyring)
 			flags = KEY_ALLOC_IN_QUOTA;
 
-		keyring = keyring_alloc(buf, tsk->cred->uid, tsk->cred->gid,
-					tsk, flags, NULL);
+		keyring = keyring_alloc("_ses", cred->uid, cred->gid,
+					cred, flags, NULL);
 		if (IS_ERR(keyring))
 			return PTR_ERR(keyring);
-	}
-	else {
+	} else {
 		atomic_inc(&keyring->usage);
 	}
 
 	/* install the keyring */
-	spin_lock_irq(&tsk->cred->tgcred->lock);
-	old = tsk->cred->tgcred->session_keyring;
-	rcu_assign_pointer(tsk->cred->tgcred->session_keyring, keyring);
-	spin_unlock_irq(&tsk->cred->tgcred->lock);
+	spin_lock_irq(&cred->tgcred->lock);
+	old = cred->tgcred->session_keyring;
+	rcu_assign_pointer(cred->tgcred->session_keyring, keyring);
+	spin_unlock_irq(&cred->tgcred->lock);
 
 	/* we're using RCU on the pointer, but there's no point synchronising
 	 * on it if it didn't previously point to anything */
@@ -261,38 +250,29 @@ static int install_session_keyring(struct key *keyring)
 	}
 
 	return 0;
+}
 
-} /* end install_session_keyring() */
-
-/*****************************************************************************/
 /*
- * copy the keys for fork
+ * install a session keyring, discarding the old one
+ * - if a keyring is not supplied, an empty one is invented
  */
-int copy_keys(unsigned long clone_flags, struct task_struct *tsk)
+static int install_session_keyring(struct key *keyring)
 {
-	key_check(tsk->cred->thread_keyring);
-	key_check(tsk->cred->request_key_auth);
-
-	/* no thread keyring yet */
-	tsk->cred->thread_keyring = NULL;
-
-	/* copy the request_key() authorisation for this thread */
-	key_get(tsk->cred->request_key_auth);
-
-	return 0;
+	struct cred *new;
+	int ret;
 
-} /* end copy_keys() */
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
 
-/*****************************************************************************/
-/*
- * dispose of per-thread keys upon thread exit
- */
-void exit_keys(struct task_struct *tsk)
-{
-	key_put(tsk->cred->thread_keyring);
-	key_put(tsk->cred->request_key_auth);
+	ret = install_session_keyring_to_cred(new, NULL);
+	if (ret < 0) {
+		abort_creds(new);
+		return ret;
+	}
 
-} /* end exit_keys() */
+	return commit_creds(new);
+}
 
 /*****************************************************************************/
 /*
@@ -300,38 +280,41 @@ void exit_keys(struct task_struct *tsk)
  */
 int exec_keys(struct task_struct *tsk)
 {
-	struct key *old;
+	struct thread_group_cred *tgcred = NULL;
+	struct cred *new;
 
-	/* newly exec'd tasks don't get a thread keyring */
-	task_lock(tsk);
-	old = tsk->cred->thread_keyring;
-	tsk->cred->thread_keyring = NULL;
-	task_unlock(tsk);
+#ifdef CONFIG_KEYS
+	tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+	if (!tgcred)
+		return -ENOMEM;
+#endif
 
-	key_put(old);
+	new = prepare_creds();
+	if (new < 0)
+		return -ENOMEM;
 
-	/* discard the process keyring from a newly exec'd task */
-	spin_lock_irq(&tsk->cred->tgcred->lock);
-	old = tsk->cred->tgcred->process_keyring;
-	tsk->cred->tgcred->process_keyring = NULL;
-	spin_unlock_irq(&tsk->cred->tgcred->lock);
+	/* newly exec'd tasks don't get a thread keyring */
+	key_put(new->thread_keyring);
+	new->thread_keyring = NULL;
 
-	key_put(old);
+	/* create a new per-thread-group creds for all this set of threads to
+	 * share */
+	memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
 
-	return 0;
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
 
-} /* end exec_keys() */
+	/* inherit the session keyring; new process keyring */
+	key_get(tgcred->session_keyring);
+	tgcred->process_keyring = NULL;
 
-/*****************************************************************************/
-/*
- * deal with SUID programs
- * - we might want to make this invent a new session keyring
- */
-int suid_keys(struct task_struct *tsk)
-{
+	release_tgcred(new);
+	new->tgcred = tgcred;
+
+	commit_creds(new);
 	return 0;
 
-} /* end suid_keys() */
+} /* end exec_keys() */
 
 /*****************************************************************************/
 /*
@@ -376,16 +359,13 @@ void key_fsgid_changed(struct task_struct *tsk)
 key_ref_t search_process_keyrings(struct key_type *type,
 				  const void *description,
 				  key_match_func_t match,
-				  struct task_struct *context)
+				  const struct cred *cred)
 {
 	struct request_key_auth *rka;
-	struct cred *cred;
 	key_ref_t key_ref, ret, err;
 
 	might_sleep();
 
-	cred = get_task_cred(context);
-
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
 	 * otherwise we want to return a sample error (probably -EACCES) if
@@ -401,7 +381,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	if (cred->thread_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(cred->thread_keyring, 1),
-			context, type, description, match);
+			cred, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -422,7 +402,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	if (cred->tgcred->process_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(cred->tgcred->process_keyring, 1),
-			context, type, description, match);
+			cred, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -446,7 +426,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			make_key_ref(rcu_dereference(
 					     cred->tgcred->session_keyring),
 				     1),
-			context, type, description, match);
+			cred, type, description, match);
 		rcu_read_unlock();
 
 		if (!IS_ERR(key_ref))
@@ -468,7 +448,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	else if (cred->user->session_keyring) {
 		key_ref = keyring_search_aux(
 			make_key_ref(cred->user->session_keyring, 1),
-			context, type, description, match);
+			cred, type, description, match);
 		if (!IS_ERR(key_ref))
 			goto found;
 
@@ -490,7 +470,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	 * - we don't permit access to request_key auth keys via this method
 	 */
 	if (cred->request_key_auth &&
-	    context == current &&
+	    cred == current_cred() &&
 	    type != &key_type_request_key_auth
 	    ) {
 		/* defend against the auth key being revoked */
@@ -500,7 +480,7 @@ key_ref_t search_process_keyrings(struct key_type *type,
 			rka = cred->request_key_auth->payload.data;
 
 			key_ref = search_process_keyrings(type, description,
-							  match, rka->context);
+							  match, rka->cred);
 
 			up_read(&cred->request_key_auth->sem);
 
@@ -527,7 +507,6 @@ key_ref_t search_process_keyrings(struct key_type *type,
 	key_ref = ret ? ret : err;
 
 found:
-	put_cred(cred);
 	return key_ref;
 
 } /* end search_process_keyrings() */
@@ -552,8 +531,7 @@ key_ref_t lookup_user_key(key_serial_t id, int create, int partial,
 			  key_perm_t perm)
 {
 	struct request_key_auth *rka;
-	struct task_struct *t = current;
-	struct cred *cred;
+	const struct cred *cred;
 	struct key *key;
 	key_ref_t key_ref, skey_ref;
 	int ret;
@@ -608,6 +586,7 @@ try_again:
 				goto error;
 			ret = install_session_keyring(
 				cred->user->session_keyring);
+
 			if (ret < 0)
 				goto error;
 			goto reget_creds;
@@ -693,7 +672,7 @@ try_again:
 		/* check to see if we possess the key */
 		skey_ref = search_process_keyrings(key->type, key,
 						   lookup_user_key_possessed,
-						   current);
+						   cred);
 
 		if (!IS_ERR(skey_ref)) {
 			key_put(key);
@@ -725,7 +704,7 @@ try_again:
 		goto invalid_key;
 
 	/* check the permissions */
-	ret = key_task_permission(key_ref, t, perm);
+	ret = key_task_permission(key_ref, cred, perm);
 	if (ret < 0)
 		goto invalid_key;
 
@@ -755,21 +734,33 @@ reget_creds:
  */
 long join_session_keyring(const char *name)
 {
-	struct task_struct *tsk = current;
-	struct cred *cred = current->cred;
+	const struct cred *old;
+	struct cred *new;
 	struct key *keyring;
-	long ret;
+	long ret, serial;
+
+	/* only permit this if there's a single thread in the thread group -
+	 * this avoids us having to adjust the creds on all threads and risking
+	 * ENOMEM */
+	if (!is_single_threaded(current))
+		return -EMLINK;
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	old = current_cred();
 
 	/* if no name is provided, install an anonymous keyring */
 	if (!name) {
-		ret = install_session_keyring(NULL);
+		ret = install_session_keyring_to_cred(new, NULL);
 		if (ret < 0)
 			goto error;
 
-		rcu_read_lock();
-		ret = rcu_dereference(cred->tgcred->session_keyring)->serial;
-		rcu_read_unlock();
-		goto error;
+		serial = new->tgcred->session_keyring->serial;
+		ret = commit_creds(new);
+		if (ret == 0)
+			ret = serial;
+		goto okay;
 	}
 
 	/* allow the user to join or create a named keyring */
@@ -779,29 +770,33 @@ long join_session_keyring(const char *name)
 	keyring = find_keyring_by_name(name, false);
 	if (PTR_ERR(keyring) == -ENOKEY) {
 		/* not found - try and create a new one */
-		keyring = keyring_alloc(name, cred->uid, cred->gid, tsk,
+		keyring = keyring_alloc(name, old->uid, old->gid, old,
 					KEY_ALLOC_IN_QUOTA, NULL);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
 		}
-	}
-	else if (IS_ERR(keyring)) {
+	} else if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error2;
 	}
 
 	/* we've got a keyring - now to install it */
-	ret = install_session_keyring(keyring);
+	ret = install_session_keyring_to_cred(new, keyring);
 	if (ret < 0)
 		goto error2;
 
+	commit_creds(new);
+	mutex_unlock(&key_session_mutex);
+
 	ret = keyring->serial;
 	key_put(keyring);
+okay:
+	return ret;
 
 error2:
 	mutex_unlock(&key_session_mutex);
 error:
+	abort_creds(new);
 	return ret;
-
-} /* end join_session_keyring() */
+}
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 3d12558362d..0e04f72ef2d 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -83,8 +83,10 @@ static int call_sbin_request_key(struct key_construction *cons,
 	/* allocate a new session keyring */
 	sprintf(desc, "_req.%u", key->serial);
 
-	keyring = keyring_alloc(desc, current_fsuid(), current_fsgid(), current,
+	cred = get_current_cred();
+	keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
 				KEY_ALLOC_QUOTA_OVERRUN, NULL);
+	put_cred(cred);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error_alloc;
@@ -104,8 +106,7 @@ static int call_sbin_request_key(struct key_construction *cons,
 
 	/* we specify the process's default keyrings */
 	sprintf(keyring_str[0], "%d",
-		cred->thread_keyring ?
-		cred->thread_keyring->serial : 0);
+		cred->thread_keyring ? cred->thread_keyring->serial : 0);
 
 	prkey = 0;
 	if (cred->tgcred->process_keyring)
@@ -155,8 +156,8 @@ error_link:
 	key_put(keyring);
 
 error_alloc:
-	kleave(" = %d", ret);
 	complete_request_key(cons, ret);
+	kleave(" = %d", ret);
 	return ret;
 }
 
@@ -295,6 +296,7 @@ static int construct_alloc_key(struct key_type *type,
 			       struct key_user *user,
 			       struct key **_key)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	key_ref_t key_ref;
 
@@ -302,9 +304,8 @@ static int construct_alloc_key(struct key_type *type,
 
 	mutex_lock(&user->cons_lock);
 
-	key = key_alloc(type, description,
-			current_fsuid(), current_fsgid(), current, KEY_POS_ALL,
-			flags);
+	key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred,
+			KEY_POS_ALL, flags);
 	if (IS_ERR(key))
 		goto alloc_failed;
 
@@ -317,8 +318,7 @@ static int construct_alloc_key(struct key_type *type,
 	 * waited for locks */
 	mutex_lock(&key_construction_mutex);
 
-	key_ref = search_process_keyrings(type, description, type->match,
-					  current);
+	key_ref = search_process_keyrings(type, description, type->match, cred);
 	if (!IS_ERR(key_ref))
 		goto key_already_present;
 
@@ -363,6 +363,8 @@ static struct key *construct_key_and_link(struct key_type *type,
 	struct key *key;
 	int ret;
 
+	kenter("");
+
 	user = key_user_lookup(current_fsuid());
 	if (!user)
 		return ERR_PTR(-ENOMEM);
@@ -376,17 +378,21 @@ static struct key *construct_key_and_link(struct key_type *type,
 	if (ret == 0) {
 		ret = construct_key(key, callout_info, callout_len, aux,
 				    dest_keyring);
-		if (ret < 0)
+		if (ret < 0) {
+			kdebug("cons failed");
 			goto construction_failed;
+		}
 	}
 
 	key_put(dest_keyring);
+	kleave(" = key %d", key_serial(key));
 	return key;
 
 construction_failed:
 	key_negate_and_link(key, key_negative_timeout, NULL, NULL);
 	key_put(key);
 	key_put(dest_keyring);
+	kleave(" = %d", ret);
 	return ERR_PTR(ret);
 }
 
@@ -405,6 +411,7 @@ struct key *request_key_and_link(struct key_type *type,
 				 struct key *dest_keyring,
 				 unsigned long flags)
 {
+	const struct cred *cred = current_cred();
 	struct key *key;
 	key_ref_t key_ref;
 
@@ -414,7 +421,7 @@ struct key *request_key_and_link(struct key_type *type,
 
 	/* search all the process keyrings for a key */
 	key_ref = search_process_keyrings(type, description, type->match,
-					  current);
+					  cred);
 
 	if (!IS_ERR(key_ref)) {
 		key = key_ref_to_ptr(key_ref);
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 2125579d5d7..86747151ee5 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -105,9 +105,9 @@ static void request_key_auth_revoke(struct key *key)
 
 	kenter("{%d}", key->serial);
 
-	if (rka->context) {
-		put_task_struct(rka->context);
-		rka->context = NULL;
+	if (rka->cred) {
+		put_cred(rka->cred);
+		rka->cred = NULL;
 	}
 
 } /* end request_key_auth_revoke() */
@@ -122,9 +122,9 @@ static void request_key_auth_destroy(struct key *key)
 
 	kenter("{%d}", key->serial);
 
-	if (rka->context) {
-		put_task_struct(rka->context);
-		rka->context = NULL;
+	if (rka->cred) {
+		put_cred(rka->cred);
+		rka->cred = NULL;
 	}
 
 	key_put(rka->target_key);
@@ -143,6 +143,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 				 size_t callout_len, struct key *dest_keyring)
 {
 	struct request_key_auth *rka, *irka;
+	const struct cred *cred = current->cred;
 	struct key *authkey = NULL;
 	char desc[20];
 	int ret;
@@ -164,28 +165,25 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 
 	/* see if the calling process is already servicing the key request of
 	 * another process */
-	if (current->cred->request_key_auth) {
+	if (cred->request_key_auth) {
 		/* it is - use that instantiation context here too */
-		down_read(&current->cred->request_key_auth->sem);
+		down_read(&cred->request_key_auth->sem);
 
 		/* if the auth key has been revoked, then the key we're
 		 * servicing is already instantiated */
-		if (test_bit(KEY_FLAG_REVOKED,
-			     &current->cred->request_key_auth->flags))
+		if (test_bit(KEY_FLAG_REVOKED, &cred->request_key_auth->flags))
 			goto auth_key_revoked;
 
-		irka = current->cred->request_key_auth->payload.data;
-		rka->context = irka->context;
+		irka = cred->request_key_auth->payload.data;
+		rka->cred = get_cred(irka->cred);
 		rka->pid = irka->pid;
-		get_task_struct(rka->context);
 
-		up_read(&current->cred->request_key_auth->sem);
+		up_read(&cred->request_key_auth->sem);
 	}
 	else {
 		/* it isn't - use this process as the context */
-		rka->context = current;
+		rka->cred = get_cred(cred);
 		rka->pid = current->pid;
-		get_task_struct(rka->context);
 	}
 
 	rka->target_key = key_get(target);
@@ -197,7 +195,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 	sprintf(desc, "%x", target->serial);
 
 	authkey = key_alloc(&key_type_request_key_auth, desc,
-			    current_fsuid(), current_fsgid(), current,
+			    cred->fsuid, cred->fsgid, cred,
 			    KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH |
 			    KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA);
 	if (IS_ERR(authkey)) {
@@ -205,16 +203,16 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info,
 		goto error_alloc;
 	}
 
-	/* construct and attach to the keyring */
+	/* construct the auth key */
 	ret = key_instantiate_and_link(authkey, rka, 0, NULL, NULL);
 	if (ret < 0)
 		goto error_inst;
 
-	kleave(" = {%d}", authkey->serial);
+	kleave(" = {%d,%d}", authkey->serial, atomic_read(&authkey->usage));
 	return authkey;
 
 auth_key_revoked:
-	up_read(&current->cred->request_key_auth->sem);
+	up_read(&cred->request_key_auth->sem);
 	kfree(rka->callout_info);
 	kfree(rka);
 	kleave("= -EKEYREVOKED");
@@ -257,6 +255,7 @@ static int key_get_instantiation_authkey_match(const struct key *key,
  */
 struct key *key_get_instantiation_authkey(key_serial_t target_id)
 {
+	const struct cred *cred = current_cred();
 	struct key *authkey;
 	key_ref_t authkey_ref;
 
@@ -264,7 +263,7 @@ struct key *key_get_instantiation_authkey(key_serial_t target_id)
 		&key_type_request_key_auth,
 		(void *) (unsigned long) target_id,
 		key_get_instantiation_authkey_match,
-		current);
+		cred);
 
 	if (IS_ERR(authkey_ref)) {
 		authkey = ERR_CAST(authkey_ref);
diff --git a/security/security.c b/security/security.c
index f40a0a04c3c..a55d739c686 100644
--- a/security/security.c
+++ b/security/security.c
@@ -145,18 +145,13 @@ int security_capget(struct task_struct *target,
 	return security_ops->capget(target, effective, inheritable, permitted);
 }
 
-int security_capset_check(const kernel_cap_t *effective,
-			  const kernel_cap_t *inheritable,
-			  const kernel_cap_t *permitted)
+int security_capset(struct cred *new, const struct cred *old,
+		    const kernel_cap_t *effective,
+		    const kernel_cap_t *inheritable,
+		    const kernel_cap_t *permitted)
 {
-	return security_ops->capset_check(effective, inheritable, permitted);
-}
-
-void security_capset_set(const kernel_cap_t *effective,
-			 const kernel_cap_t *inheritable,
-			 const kernel_cap_t *permitted)
-{
-	security_ops->capset_set(effective, inheritable, permitted);
+	return security_ops->capset(new, old,
+				    effective, inheritable, permitted);
 }
 
 int security_capable(struct task_struct *tsk, int cap)
@@ -228,9 +223,9 @@ void security_bprm_free(struct linux_binprm *bprm)
 	security_ops->bprm_free_security(bprm);
 }
 
-void security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 {
-	security_ops->bprm_apply_creds(bprm, unsafe);
+	return security_ops->bprm_apply_creds(bprm, unsafe);
 }
 
 void security_bprm_post_apply_creds(struct linux_binprm *bprm)
@@ -616,14 +611,19 @@ int security_task_create(unsigned long clone_flags)
 	return security_ops->task_create(clone_flags);
 }
 
-int security_cred_alloc(struct cred *cred)
+void security_cred_free(struct cred *cred)
 {
-	return security_ops->cred_alloc_security(cred);
+	security_ops->cred_free(cred);
 }
 
-void security_cred_free(struct cred *cred)
+int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
 {
-	security_ops->cred_free(cred);
+	return security_ops->cred_prepare(new, old, gfp);
+}
+
+void security_commit_creds(struct cred *new, const struct cred *old)
+{
+	return security_ops->cred_commit(new, old);
 }
 
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
@@ -631,10 +631,10 @@ int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 	return security_ops->task_setuid(id0, id1, id2, flags);
 }
 
-int security_task_post_setuid(uid_t old_ruid, uid_t old_euid,
-			       uid_t old_suid, int flags)
+int security_task_fix_setuid(struct cred *new, const struct cred *old,
+			     int flags)
 {
-	return security_ops->task_post_setuid(old_ruid, old_euid, old_suid, flags);
+	return security_ops->task_fix_setuid(new, old, flags);
 }
 
 int security_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags)
@@ -716,14 +716,9 @@ int security_task_wait(struct task_struct *p)
 }
 
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			 unsigned long arg4, unsigned long arg5, long *rc_p)
-{
-	return security_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p);
-}
-
-void security_task_reparent_to_init(struct task_struct *p)
+			 unsigned long arg4, unsigned long arg5)
 {
-	security_ops->task_reparent_to_init(p);
+	return security_ops->task_prctl(option, arg2, arg3, arg4, arg5);
 }
 
 void security_task_to_inode(struct task_struct *p, struct inode *inode)
@@ -1123,9 +1118,10 @@ EXPORT_SYMBOL(security_skb_classify_flow);
 
 #ifdef CONFIG_KEYS
 
-int security_key_alloc(struct key *key, struct task_struct *tsk, unsigned long flags)
+int security_key_alloc(struct key *key, const struct cred *cred,
+		       unsigned long flags)
 {
-	return security_ops->key_alloc(key, tsk, flags);
+	return security_ops->key_alloc(key, cred, flags);
 }
 
 void security_key_free(struct key *key)
@@ -1134,9 +1130,9 @@ void security_key_free(struct key *key)
 }
 
 int security_key_permission(key_ref_t key_ref,
-			    struct task_struct *context, key_perm_t perm)
+			    const struct cred *cred, key_perm_t perm)
 {
-	return security_ops->key_permission(key_ref, context, perm);
+	return security_ops->key_permission(key_ref, cred, perm);
 }
 
 int security_key_getsecurity(struct key *key, char **_buffer)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index f20cbd681ba..c71bba78872 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -156,20 +156,20 @@ static int selinux_secmark_enabled(void)
 	return (atomic_read(&selinux_secmark_refcount) > 0);
 }
 
-/* Allocate and free functions for each kind of security blob. */
-
-static int cred_alloc_security(struct cred *cred)
+/*
+ * initialise the security for the init task
+ */
+static void cred_init_security(void)
 {
+	struct cred *cred = (struct cred *) current->cred;
 	struct task_security_struct *tsec;
 
 	tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL);
 	if (!tsec)
-		return -ENOMEM;
+		panic("SELinux:  Failed to initialize initial task.\n");
 
-	tsec->osid = tsec->sid = SECINITSID_UNLABELED;
+	tsec->osid = tsec->sid = SECINITSID_KERNEL;
 	cred->security = tsec;
-
-	return 0;
 }
 
 /*
@@ -1378,6 +1378,19 @@ static inline u32 signal_to_av(int sig)
 	return perm;
 }
 
+/*
+ * Check permission between a pair of credentials
+ * fork check, ptrace check, etc.
+ */
+static int cred_has_perm(const struct cred *actor,
+			 const struct cred *target,
+			 u32 perms)
+{
+	u32 asid = cred_sid(actor), tsid = cred_sid(target);
+
+	return avc_has_perm(asid, tsid, SECCLASS_PROCESS, perms, NULL);
+}
+
 /*
  * Check permission between a pair of tasks, e.g. signal checks,
  * fork check, ptrace check, etc.
@@ -1820,24 +1833,19 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 	return secondary_ops->capget(target, effective, inheritable, permitted);
 }
 
-static int selinux_capset_check(const kernel_cap_t *effective,
-				const kernel_cap_t *inheritable,
-				const kernel_cap_t *permitted)
+static int selinux_capset(struct cred *new, const struct cred *old,
+			  const kernel_cap_t *effective,
+			  const kernel_cap_t *inheritable,
+			  const kernel_cap_t *permitted)
 {
 	int error;
 
-	error = secondary_ops->capset_check(effective, inheritable, permitted);
+	error = secondary_ops->capset(new, old,
+				      effective, inheritable, permitted);
 	if (error)
 		return error;
 
-	return task_has_perm(current, current, PROCESS__SETCAP);
-}
-
-static void selinux_capset_set(const kernel_cap_t *effective,
-			       const kernel_cap_t *inheritable,
-			       const kernel_cap_t *permitted)
-{
-	secondary_ops->capset_set(effective, inheritable, permitted);
+	return cred_has_perm(old, new, PROCESS__SETCAP);
 }
 
 static int selinux_capable(struct task_struct *tsk, int cap, int audit)
@@ -2244,16 +2252,23 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 	spin_unlock(&files->file_lock);
 }
 
-static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 {
 	struct task_security_struct *tsec;
 	struct bprm_security_struct *bsec;
+	struct cred *new;
 	u32 sid;
 	int rc;
 
-	secondary_ops->bprm_apply_creds(bprm, unsafe);
+	rc = secondary_ops->bprm_apply_creds(bprm, unsafe);
+	if (rc < 0)
+		return rc;
 
-	tsec = current_security();
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	tsec = new->security;
 
 	bsec = bprm->security;
 	sid = bsec->sid;
@@ -2268,7 +2283,7 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 					PROCESS__SHARE, NULL);
 			if (rc) {
 				bsec->unsafe = 1;
-				return;
+				goto out;
 			}
 		}
 
@@ -2292,12 +2307,16 @@ static void selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
 						  PROCESS__PTRACE, NULL);
 				if (rc) {
 					bsec->unsafe = 1;
-					return;
+					goto out;
 				}
 			}
 		}
 		tsec->sid = sid;
 	}
+
+out:
+	commit_creds(new);
+	return 0;
 }
 
 /*
@@ -3021,6 +3040,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
 static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
 {
 	const struct cred *cred = current_cred();
+	int rc = 0;
 
 #ifndef CONFIG_PPC32
 	if ((prot & PROT_EXEC) && (!file || (!shared && (prot & PROT_WRITE)))) {
@@ -3029,9 +3049,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared
 		 * private file mapping that will also be writable.
 		 * This has an additional check.
 		 */
-		int rc = task_has_perm(current, current, PROCESS__EXECMEM);
+		rc = cred_has_perm(cred, cred, PROCESS__EXECMEM);
 		if (rc)
-			return rc;
+			goto error;
 	}
 #endif
 
@@ -3048,7 +3068,9 @@ static int file_map_prot_check(struct file *file, unsigned long prot, int shared
 
 		return file_has_perm(cred, file, av);
 	}
-	return 0;
+
+error:
+	return rc;
 }
 
 static int selinux_file_mmap(struct file *file, unsigned long reqprot,
@@ -3090,8 +3112,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 		rc = 0;
 		if (vma->vm_start >= vma->vm_mm->start_brk &&
 		    vma->vm_end <= vma->vm_mm->brk) {
-			rc = task_has_perm(current, current,
-					   PROCESS__EXECHEAP);
+			rc = cred_has_perm(cred, cred, PROCESS__EXECHEAP);
 		} else if (!vma->vm_file &&
 			   vma->vm_start <= vma->vm_mm->start_stack &&
 			   vma->vm_end >= vma->vm_mm->start_stack) {
@@ -3104,8 +3125,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 			 * modified content.  This typically should only
 			 * occur for text relocations.
 			 */
-			rc = file_has_perm(cred, vma->vm_file,
-					   FILE__EXECMOD);
+			rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);
 		}
 		if (rc)
 			return rc;
@@ -3211,6 +3231,7 @@ static int selinux_dentry_open(struct file *file, const struct cred *cred)
 	struct file_security_struct *fsec;
 	struct inode *inode;
 	struct inode_security_struct *isec;
+
 	inode = file->f_path.dentry->d_inode;
 	fsec = file->f_security;
 	isec = inode->i_security;
@@ -3247,38 +3268,41 @@ static int selinux_task_create(unsigned long clone_flags)
 	return task_has_perm(current, current, PROCESS__FORK);
 }
 
-static int selinux_cred_alloc_security(struct cred *cred)
+/*
+ * detach and free the LSM part of a set of credentials
+ */
+static void selinux_cred_free(struct cred *cred)
 {
-	struct task_security_struct *tsec1, *tsec2;
-	int rc;
-
-	tsec1 = current_security();
+	struct task_security_struct *tsec = cred->security;
+	cred->security = NULL;
+	kfree(tsec);
+}
 
-	rc = cred_alloc_security(cred);
-	if (rc)
-		return rc;
-	tsec2 = cred->security;
+/*
+ * prepare a new set of credentials for modification
+ */
+static int selinux_cred_prepare(struct cred *new, const struct cred *old,
+				gfp_t gfp)
+{
+	const struct task_security_struct *old_tsec;
+	struct task_security_struct *tsec;
 
-	tsec2->osid = tsec1->osid;
-	tsec2->sid = tsec1->sid;
+	old_tsec = old->security;
 
-	/* Retain the exec, fs, key, and sock SIDs across fork */
-	tsec2->exec_sid = tsec1->exec_sid;
-	tsec2->create_sid = tsec1->create_sid;
-	tsec2->keycreate_sid = tsec1->keycreate_sid;
-	tsec2->sockcreate_sid = tsec1->sockcreate_sid;
+	tsec = kmemdup(old_tsec, sizeof(struct task_security_struct), gfp);
+	if (!tsec)
+		return -ENOMEM;
 
+	new->security = tsec;
 	return 0;
 }
 
 /*
- * detach and free the LSM part of a set of credentials
+ * commit new credentials
  */
-static void selinux_cred_free(struct cred *cred)
+static void selinux_cred_commit(struct cred *new, const struct cred *old)
 {
-	struct task_security_struct *tsec = cred->security;
-	cred->security = NULL;
-	kfree(tsec);
+	secondary_ops->cred_commit(new, old);
 }
 
 static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
@@ -3292,9 +3316,10 @@ static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 	return 0;
 }
 
-static int selinux_task_post_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
+static int selinux_task_fix_setuid(struct cred *new, const struct cred *old,
+				   int flags)
 {
-	return secondary_ops->task_post_setuid(id0, id1, id2, flags);
+	return secondary_ops->task_fix_setuid(new, old, flags);
 }
 
 static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags)
@@ -3368,7 +3393,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim
 	/* Control the ability to change the hard limit (whether
 	   lowering or raising it), so that the hard limit can
 	   later be used as a safe reset point for the soft limit
-	   upon context transitions. See selinux_bprm_apply_creds. */
+	   upon context transitions.  See selinux_bprm_committing_creds. */
 	if (old_rlim->rlim_max != new_rlim->rlim_max)
 		return task_has_perm(current, current, PROCESS__SETRLIMIT);
 
@@ -3422,13 +3447,12 @@ static int selinux_task_prctl(int option,
 			      unsigned long arg2,
 			      unsigned long arg3,
 			      unsigned long arg4,
-			      unsigned long arg5,
-			      long *rc_p)
+			      unsigned long arg5)
 {
 	/* The current prctl operations do not appear to require
 	   any SELinux controls since they merely observe or modify
 	   the state of the current process. */
-	return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5, rc_p);
+	return secondary_ops->task_prctl(option, arg2, arg3, arg4, arg5);
 }
 
 static int selinux_task_wait(struct task_struct *p)
@@ -3436,18 +3460,6 @@ static int selinux_task_wait(struct task_struct *p)
 	return task_has_perm(p, current, PROCESS__SIGCHLD);
 }
 
-static void selinux_task_reparent_to_init(struct task_struct *p)
-{
-	struct task_security_struct *tsec;
-
-	secondary_ops->task_reparent_to_init(p);
-
-	tsec = p->cred->security;
-	tsec->osid = tsec->sid;
-	tsec->sid = SECINITSID_KERNEL;
-	return;
-}
-
 static void selinux_task_to_inode(struct task_struct *p,
 				  struct inode *inode)
 {
@@ -5325,7 +5337,8 @@ static int selinux_setprocattr(struct task_struct *p,
 {
 	struct task_security_struct *tsec;
 	struct task_struct *tracer;
-	u32 sid = 0;
+	struct cred *new;
+	u32 sid = 0, ptsid;
 	int error;
 	char *str = value;
 
@@ -5372,86 +5385,75 @@ static int selinux_setprocattr(struct task_struct *p,
 			return error;
 	}
 
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
 	/* Permission checking based on the specified context is
 	   performed during the actual operation (execve,
 	   open/mkdir/...), when we know the full context of the
-	   operation.  See selinux_bprm_set_security for the execve
+	   operation.  See selinux_bprm_set_creds for the execve
 	   checks and may_create for the file creation checks. The
 	   operation will then fail if the context is not permitted. */
-	tsec = p->cred->security;
-	if (!strcmp(name, "exec"))
+	tsec = new->security;
+	if (!strcmp(name, "exec")) {
 		tsec->exec_sid = sid;
-	else if (!strcmp(name, "fscreate"))
+	} else if (!strcmp(name, "fscreate")) {
 		tsec->create_sid = sid;
-	else if (!strcmp(name, "keycreate")) {
+	} else if (!strcmp(name, "keycreate")) {
 		error = may_create_key(sid, p);
 		if (error)
-			return error;
+			goto abort_change;
 		tsec->keycreate_sid = sid;
-	} else if (!strcmp(name, "sockcreate"))
+	} else if (!strcmp(name, "sockcreate")) {
 		tsec->sockcreate_sid = sid;
-	else if (!strcmp(name, "current")) {
-		struct av_decision avd;
-
+	} else if (!strcmp(name, "current")) {
+		error = -EINVAL;
 		if (sid == 0)
-			return -EINVAL;
-		/*
-		 * SELinux allows to change context in the following case only.
-		 *  - Single threaded processes.
-		 *  - Multi threaded processes intend to change its context into
-		 *    more restricted domain (defined by TYPEBOUNDS statement).
-		 */
-		if (atomic_read(&p->mm->mm_users) != 1) {
-			struct task_struct *g, *t;
-			struct mm_struct *mm = p->mm;
-			read_lock(&tasklist_lock);
-			do_each_thread(g, t) {
-				if (t->mm == mm && t != p) {
-					read_unlock(&tasklist_lock);
-					error = security_bounded_transition(tsec->sid, sid);
-					if (!error)
-						goto boundary_ok;
-
-					return error;
-				}
-			} while_each_thread(g, t);
-			read_unlock(&tasklist_lock);
+			goto abort_change;
+
+		/* Only allow single threaded processes to change context */
+		error = -EPERM;
+		if (!is_single_threaded(p)) {
+			error = security_bounded_transition(tsec->sid, sid);
+			if (error)
+				goto abort_change;
 		}
-boundary_ok:
 
 		/* Check permissions for the transition. */
 		error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
 				     PROCESS__DYNTRANSITION, NULL);
 		if (error)
-			return error;
+			goto abort_change;
 
 		/* Check for ptracing, and update the task SID if ok.
 		   Otherwise, leave SID unchanged and fail. */
+		ptsid = 0;
 		task_lock(p);
-		rcu_read_lock();
 		tracer = tracehook_tracer_task(p);
-		if (tracer != NULL) {
-			u32 ptsid = task_sid(tracer);
-			rcu_read_unlock();
-			error = avc_has_perm_noaudit(ptsid, sid,
-						     SECCLASS_PROCESS,
-						     PROCESS__PTRACE, 0, &avd);
-			if (!error)
-				tsec->sid = sid;
-			task_unlock(p);
-			avc_audit(ptsid, sid, SECCLASS_PROCESS,
-				  PROCESS__PTRACE, &avd, error, NULL);
+		if (tracer)
+			ptsid = task_sid(tracer);
+		task_unlock(p);
+
+		if (tracer) {
+			error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS,
+					     PROCESS__PTRACE, NULL);
 			if (error)
-				return error;
-		} else {
-			rcu_read_unlock();
-			tsec->sid = sid;
-			task_unlock(p);
+				goto abort_change;
 		}
-	} else
-		return -EINVAL;
 
+		tsec->sid = sid;
+	} else {
+		error = -EINVAL;
+		goto abort_change;
+	}
+
+	commit_creds(new);
 	return size;
+
+abort_change:
+	abort_creds(new);
+	return error;
 }
 
 static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
@@ -5471,23 +5473,21 @@ static void selinux_release_secctx(char *secdata, u32 seclen)
 
 #ifdef CONFIG_KEYS
 
-static int selinux_key_alloc(struct key *k, struct task_struct *tsk,
+static int selinux_key_alloc(struct key *k, const struct cred *cred,
 			     unsigned long flags)
 {
-	const struct task_security_struct *__tsec;
+	const struct task_security_struct *tsec;
 	struct key_security_struct *ksec;
 
 	ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL);
 	if (!ksec)
 		return -ENOMEM;
 
-	rcu_read_lock();
-	__tsec = __task_cred(tsk)->security;
-	if (__tsec->keycreate_sid)
-		ksec->sid = __tsec->keycreate_sid;
+	tsec = cred->security;
+	if (tsec->keycreate_sid)
+		ksec->sid = tsec->keycreate_sid;
 	else
-		ksec->sid = __tsec->sid;
-	rcu_read_unlock();
+		ksec->sid = tsec->sid;
 
 	k->security = ksec;
 	return 0;
@@ -5502,8 +5502,8 @@ static void selinux_key_free(struct key *k)
 }
 
 static int selinux_key_permission(key_ref_t key_ref,
-			    struct task_struct *ctx,
-			    key_perm_t perm)
+				  const struct cred *cred,
+				  key_perm_t perm)
 {
 	struct key *key;
 	struct key_security_struct *ksec;
@@ -5515,7 +5515,7 @@ static int selinux_key_permission(key_ref_t key_ref,
 	if (perm == 0)
 		return 0;
 
-	sid = task_sid(ctx);
+	sid = cred_sid(cred);
 
 	key = key_ref_to_ptr(key_ref);
 	ksec = key->security;
@@ -5545,8 +5545,7 @@ static struct security_operations selinux_ops = {
 	.ptrace_may_access =		selinux_ptrace_may_access,
 	.ptrace_traceme =		selinux_ptrace_traceme,
 	.capget =			selinux_capget,
-	.capset_check =			selinux_capset_check,
-	.capset_set =			selinux_capset_set,
+	.capset =			selinux_capset,
 	.sysctl =			selinux_sysctl,
 	.capable =			selinux_capable,
 	.quotactl =			selinux_quotactl,
@@ -5621,10 +5620,11 @@ static struct security_operations selinux_ops = {
 	.dentry_open =			selinux_dentry_open,
 
 	.task_create =			selinux_task_create,
-	.cred_alloc_security =		selinux_cred_alloc_security,
 	.cred_free =			selinux_cred_free,
+	.cred_prepare =			selinux_cred_prepare,
+	.cred_commit =			selinux_cred_commit,
 	.task_setuid =			selinux_task_setuid,
-	.task_post_setuid =		selinux_task_post_setuid,
+	.task_fix_setuid =		selinux_task_fix_setuid,
 	.task_setgid =			selinux_task_setgid,
 	.task_setpgid =			selinux_task_setpgid,
 	.task_getpgid =			selinux_task_getpgid,
@@ -5641,7 +5641,6 @@ static struct security_operations selinux_ops = {
 	.task_kill =			selinux_task_kill,
 	.task_wait =			selinux_task_wait,
 	.task_prctl =			selinux_task_prctl,
-	.task_reparent_to_init =	selinux_task_reparent_to_init,
 	.task_to_inode =		selinux_task_to_inode,
 
 	.ipc_permission =		selinux_ipc_permission,
@@ -5737,8 +5736,6 @@ static struct security_operations selinux_ops = {
 
 static __init int selinux_init(void)
 {
-	struct task_security_struct *tsec;
-
 	if (!security_module_enable(&selinux_ops)) {
 		selinux_enabled = 0;
 		return 0;
@@ -5752,10 +5749,7 @@ static __init int selinux_init(void)
 	printk(KERN_INFO "SELinux:  Initializing.\n");
 
 	/* Set the security state for the initial task. */
-	if (cred_alloc_security(current->cred))
-		panic("SELinux:  Failed to initialize initial task.\n");
-	tsec = current->cred->security;
-	tsec->osid = tsec->sid = SECINITSID_KERNEL;
+	cred_init_security();
 
 	sel_inode_cache = kmem_cache_create("selinux_inode_security",
 					    sizeof(struct inode_security_struct),
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 11167fd567b..e952b397153 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -104,8 +104,7 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(current->cred->security, ctp->cred->security,
-			MAY_READWRITE);
+	rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE);
 	if (rc != 0 && capable(CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -127,8 +126,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(ptp->cred->security, current->cred->security,
-			MAY_READWRITE);
+	rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE);
 	if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
 		return 0;
 	return rc;
@@ -976,22 +974,6 @@ static int smack_file_receive(struct file *file)
  * Task hooks
  */
 
-/**
- * smack_cred_alloc_security - "allocate" a task cred blob
- * @cred: the task creds in need of a blob
- *
- * Smack isn't using copies of blobs. Everyone
- * points to an immutable list. No alloc required.
- * No data copy required.
- *
- * Always returns 0
- */
-static int smack_cred_alloc_security(struct cred *cred)
-{
-	cred->security = current_security();
-	return 0;
-}
-
 /**
  * smack_cred_free - "free" task-level security credentials
  * @cred: the credentials in question
@@ -1005,6 +987,30 @@ static void smack_cred_free(struct cred *cred)
 	cred->security = NULL;
 }
 
+/**
+ * smack_cred_prepare - prepare new set of credentials for modification
+ * @new: the new credentials
+ * @old: the original credentials
+ * @gfp: the atomicity of any memory allocations
+ *
+ * Prepare a new set of credentials for modification.
+ */
+static int smack_cred_prepare(struct cred *new, const struct cred *old,
+			      gfp_t gfp)
+{
+	new->security = old->security;
+	return 0;
+}
+
+/*
+ * commit new credentials
+ * @new: the new credentials
+ * @old: the original credentials
+ */
+static void smack_cred_commit(struct cred *new, const struct cred *old)
+{
+}
+
 /**
  * smack_task_setpgid - Smack check on setting pgid
  * @p: the task object
@@ -2036,6 +2042,7 @@ static int smack_getprocattr(struct task_struct *p, char *name, char **value)
 static int smack_setprocattr(struct task_struct *p, char *name,
 			     void *value, size_t size)
 {
+	struct cred *new;
 	char *newsmack;
 
 	/*
@@ -2058,7 +2065,11 @@ static int smack_setprocattr(struct task_struct *p, char *name,
 	if (newsmack == NULL)
 		return -EINVAL;
 
-	p->cred->security = newsmack;
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+	new->security = newsmack;
+	commit_creds(new);
 	return size;
 }
 
@@ -2354,17 +2365,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 /**
  * smack_key_alloc - Set the key security blob
  * @key: object
- * @tsk: the task associated with the key
+ * @cred: the credentials to use
  * @flags: unused
  *
  * No allocation required
  *
  * Returns 0
  */
-static int smack_key_alloc(struct key *key, struct task_struct *tsk,
+static int smack_key_alloc(struct key *key, const struct cred *cred,
 			   unsigned long flags)
 {
-	key->security = tsk->cred->security;
+	key->security = cred->security;
 	return 0;
 }
 
@@ -2382,14 +2393,14 @@ static void smack_key_free(struct key *key)
 /*
  * smack_key_permission - Smack access on a key
  * @key_ref: gets to the object
- * @context: task involved
+ * @cred: the credentials to use
  * @perm: unused
  *
  * Return 0 if the task has read and write to the object,
  * an error code otherwise
  */
 static int smack_key_permission(key_ref_t key_ref,
-				struct task_struct *context, key_perm_t perm)
+				const struct cred *cred, key_perm_t perm)
 {
 	struct key *keyp;
 
@@ -2405,11 +2416,10 @@ static int smack_key_permission(key_ref_t key_ref,
 	/*
 	 * This should not occur
 	 */
-	if (context->cred->security == NULL)
+	if (cred->security == NULL)
 		return -EACCES;
 
-	return smk_access(context->cred->security, keyp->security,
-			  MAY_READWRITE);
+	return smk_access(cred->security, keyp->security, MAY_READWRITE);
 }
 #endif /* CONFIG_KEYS */
 
@@ -2580,8 +2590,7 @@ struct security_operations smack_ops = {
 	.ptrace_may_access =		smack_ptrace_may_access,
 	.ptrace_traceme =		smack_ptrace_traceme,
 	.capget = 			cap_capget,
-	.capset_check = 		cap_capset_check,
-	.capset_set = 			cap_capset_set,
+	.capset = 			cap_capset,
 	.capable = 			cap_capable,
 	.syslog = 			smack_syslog,
 	.settime = 			cap_settime,
@@ -2630,9 +2639,10 @@ struct security_operations smack_ops = {
 	.file_send_sigiotask = 		smack_file_send_sigiotask,
 	.file_receive = 		smack_file_receive,
 
-	.cred_alloc_security = 		smack_cred_alloc_security,
 	.cred_free =			smack_cred_free,
-	.task_post_setuid =		cap_task_post_setuid,
+	.cred_prepare =			smack_cred_prepare,
+	.cred_commit =			smack_cred_commit,
+	.task_fix_setuid =		cap_task_fix_setuid,
 	.task_setpgid = 		smack_task_setpgid,
 	.task_getpgid = 		smack_task_getpgid,
 	.task_getsid = 			smack_task_getsid,
@@ -2645,7 +2655,6 @@ struct security_operations smack_ops = {
 	.task_movememory = 		smack_task_movememory,
 	.task_kill = 			smack_task_kill,
 	.task_wait = 			smack_task_wait,
-	.task_reparent_to_init =	cap_task_reparent_to_init,
 	.task_to_inode = 		smack_task_to_inode,
 	.task_prctl =			cap_task_prctl,
 
@@ -2721,6 +2730,8 @@ struct security_operations smack_ops = {
  */
 static __init int smack_init(void)
 {
+	struct cred *cred;
+
 	if (!security_module_enable(&smack_ops))
 		return 0;
 
@@ -2729,7 +2740,8 @@ static __init int smack_init(void)
 	/*
 	 * Set the security state for the initial task.
 	 */
-	current->cred->security = &smack_known_floor.smk_known;
+	cred = (struct cred *) current->cred;
+	cred->security = &smack_known_floor.smk_known;
 
 	/*
 	 * Initialize locks
-- 
cgit v1.2.3-70-g09d2


From a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:24 +1100
Subject: CRED: Make execve() take advantage of copy-on-write credentials

Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.

This patch and the preceding patches have been tested with the LTP SELinux
testsuite.

This patch makes several logical sets of alteration:

 (1) execve().

     The credential bits from struct linux_binprm are, for the most part,
     replaced with a single credentials pointer (bprm->cred).  This means that
     all the creds can be calculated in advance and then applied at the point
     of no return with no possibility of failure.

     I would like to replace bprm->cap_effective with:

	cap_isclear(bprm->cap_effective)

     but this seems impossible due to special behaviour for processes of pid 1
     (they always retain their parent's capability masks where normally they'd
     be changed - see cap_bprm_set_creds()).

     The following sequence of events now happens:

     (a) At the start of do_execve, the current task's cred_exec_mutex is
     	 locked to prevent PTRACE_ATTACH from obsoleting the calculation of
     	 creds that we make.

     (a) prepare_exec_creds() is then called to make a copy of the current
     	 task's credentials and prepare it.  This copy is then assigned to
     	 bprm->cred.

  	 This renders security_bprm_alloc() and security_bprm_free()
     	 unnecessary, and so they've been removed.

     (b) The determination of unsafe execution is now performed immediately
     	 after (a) rather than later on in the code.  The result is stored in
     	 bprm->unsafe for future reference.

     (c) prepare_binprm() is called, possibly multiple times.

     	 (i) This applies the result of set[ug]id binaries to the new creds
     	     attached to bprm->cred.  Personality bit clearance is recorded,
     	     but now deferred on the basis that the exec procedure may yet
     	     fail.

         (ii) This then calls the new security_bprm_set_creds().  This should
	     calculate the new LSM and capability credentials into *bprm->cred.

	     This folds together security_bprm_set() and parts of
	     security_bprm_apply_creds() (these two have been removed).
	     Anything that might fail must be done at this point.

         (iii) bprm->cred_prepared is set to 1.

	     bprm->cred_prepared is 0 on the first pass of the security
	     calculations, and 1 on all subsequent passes.  This allows SELinux
	     in (ii) to base its calculations only on the initial script and
	     not on the interpreter.

     (d) flush_old_exec() is called to commit the task to execution.  This
     	 performs the following steps with regard to credentials:

	 (i) Clear pdeath_signal and set dumpable on certain circumstances that
	     may not be covered by commit_creds().

         (ii) Clear any bits in current->personality that were deferred from
             (c.i).

     (e) install_exec_creds() [compute_creds() as was] is called to install the
     	 new credentials.  This performs the following steps with regard to
     	 credentials:

         (i) Calls security_bprm_committing_creds() to apply any security
             requirements, such as flushing unauthorised files in SELinux, that
             must be done before the credentials are changed.

	     This is made up of bits of security_bprm_apply_creds() and
	     security_bprm_post_apply_creds(), both of which have been removed.
	     This function is not allowed to fail; anything that might fail
	     must have been done in (c.ii).

         (ii) Calls commit_creds() to apply the new credentials in a single
             assignment (more or less).  Possibly pdeath_signal and dumpable
             should be part of struct creds.

	 (iii) Unlocks the task's cred_replace_mutex, thus allowing
	     PTRACE_ATTACH to take place.

         (iv) Clears The bprm->cred pointer as the credentials it was holding
             are now immutable.

         (v) Calls security_bprm_committed_creds() to apply any security
             alterations that must be done after the creds have been changed.
             SELinux uses this to flush signals and signal handlers.

     (f) If an error occurs before (d.i), bprm_free() will call abort_creds()
     	 to destroy the proposed new credentials and will then unlock
     	 cred_replace_mutex.  No changes to the credentials will have been
     	 made.

 (2) LSM interface.

     A number of functions have been changed, added or removed:

     (*) security_bprm_alloc(), ->bprm_alloc_security()
     (*) security_bprm_free(), ->bprm_free_security()

     	 Removed in favour of preparing new credentials and modifying those.

     (*) security_bprm_apply_creds(), ->bprm_apply_creds()
     (*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()

     	 Removed; split between security_bprm_set_creds(),
     	 security_bprm_committing_creds() and security_bprm_committed_creds().

     (*) security_bprm_set(), ->bprm_set_security()

     	 Removed; folded into security_bprm_set_creds().

     (*) security_bprm_set_creds(), ->bprm_set_creds()

     	 New.  The new credentials in bprm->creds should be checked and set up
     	 as appropriate.  bprm->cred_prepared is 0 on the first call, 1 on the
     	 second and subsequent calls.

     (*) security_bprm_committing_creds(), ->bprm_committing_creds()
     (*) security_bprm_committed_creds(), ->bprm_committed_creds()

     	 New.  Apply the security effects of the new credentials.  This
     	 includes closing unauthorised files in SELinux.  This function may not
     	 fail.  When the former is called, the creds haven't yet been applied
     	 to the process; when the latter is called, they have.

 	 The former may access bprm->cred, the latter may not.

 (3) SELinux.

     SELinux has a number of changes, in addition to those to support the LSM
     interface changes mentioned above:

     (a) The bprm_security_struct struct has been removed in favour of using
     	 the credentials-under-construction approach.

     (c) flush_unauthorized_files() now takes a cred pointer and passes it on
     	 to inode_has_perm(), file_has_perm() and dentry_open().

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 arch/x86/ia32/ia32_aout.c         |   2 +-
 fs/binfmt_aout.c                  |   2 +-
 fs/binfmt_elf.c                   |   2 +-
 fs/binfmt_elf_fdpic.c             |   2 +-
 fs/binfmt_flat.c                  |   2 +-
 fs/binfmt_som.c                   |   2 +-
 fs/compat.c                       |  42 +++---
 fs/exec.c                         | 149 +++++++++++---------
 fs/internal.h                     |   6 +
 include/linux/audit.h             |  16 ---
 include/linux/binfmts.h           |  16 ++-
 include/linux/cred.h              |   3 +-
 include/linux/key.h               |   2 -
 include/linux/security.h          | 103 +++++---------
 kernel/cred.c                     |  46 ++++++-
 security/capability.c             |  19 +--
 security/commoncap.c              | 152 ++++++++++----------
 security/keys/process_keys.c      |  42 ------
 security/root_plug.c              |  13 +-
 security/security.c               |  26 ++--
 security/selinux/hooks.c          | 283 ++++++++++++++++----------------------
 security/selinux/include/objsec.h |  11 --
 security/smack/smack_lsm.c        |   3 +-
 23 files changed, 429 insertions(+), 515 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 127ec3f0721..2a4d073d2cf 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	current->mm->cached_hole_size = 0;
 
 	current->mm->mmap = NULL;
-	compute_creds(bprm);
+	install_exec_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
 
 	if (N_MAGIC(ex) == OMAGIC) {
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 204cfd1d767..f1f3f4192a6 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -320,7 +320,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	compute_creds(bprm);
+	install_exec_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 #ifdef __sparc__
 	if (N_MAGIC(ex) == NMAGIC) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9142ff5dc8e..f458c1217c5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -956,7 +956,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	}
 #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
 
-	compute_creds(bprm);
+	install_exec_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
 	retval = create_elf_tables(bprm, &loc->elf_ex,
 			  load_addr, interp_load_addr);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 45dabd59936..aa5b43205e3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -404,7 +404,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	current->mm->start_stack = current->mm->start_brk + stack_size;
 #endif
 
-	compute_creds(bprm);
+	install_exec_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
 	if (create_elf_fdpic_tables(bprm, current->mm,
 				    &exec_params, &interp_params) < 0)
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index ccb781a6a80..7bbd5c6b372 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -880,7 +880,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 					(libinfo.lib_list[j].loaded)?
 						libinfo.lib_list[j].start_data:UNLOADED_LIB;
 
-	compute_creds(bprm);
+	install_exec_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
 
 	set_binfmt(&flat_format);
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 74e587a5279..08644a61616 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -255,7 +255,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	kfree(hpuxhdr);
 
 	set_binfmt(&som_format);
-	compute_creds(bprm);
+	install_exec_creds(bprm);
 	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 
 	create_som_tables(bprm);
diff --git a/fs/compat.c b/fs/compat.c
index e5f49f53850..d1ece79b641 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1393,10 +1393,20 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_ret;
 
+	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	if (retval < 0)
+		goto out_free;
+
+	retval = -ENOMEM;
+	bprm->cred = prepare_exec_creds();
+	if (!bprm->cred)
+		goto out_unlock;
+	check_unsafe_exec(bprm);
+
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_kfree;
+		goto out_unlock;
 
 	sched_exec();
 
@@ -1410,14 +1420,10 @@ int compat_do_execve(char * filename,
 
 	bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
 	if ((retval = bprm->argc) < 0)
-		goto out_mm;
+		goto out;
 
 	bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
 	if ((retval = bprm->envc) < 0)
-		goto out_mm;
-
-	retval = security_bprm_alloc(bprm);
-	if (retval)
 		goto out;
 
 	retval = prepare_binprm(bprm);
@@ -1438,19 +1444,16 @@ int compat_do_execve(char * filename,
 		goto out;
 
 	retval = search_binary_handler(bprm, regs);
-	if (retval >= 0) {
-		/* execve success */
-		security_bprm_free(bprm);
-		acct_update_integrals(current);
-		free_bprm(bprm);
-		return retval;
-	}
+	if (retval < 0)
+		goto out;
 
-out:
-	if (bprm->security)
-		security_bprm_free(bprm);
+	/* execve succeeded */
+	mutex_unlock(&current->cred_exec_mutex);
+	acct_update_integrals(current);
+	free_bprm(bprm);
+	return retval;
 
-out_mm:
+out:
 	if (bprm->mm)
 		mmput(bprm->mm);
 
@@ -1460,7 +1463,10 @@ out_file:
 		fput(bprm->file);
 	}
 
-out_kfree:
+out_unlock:
+	mutex_unlock(&current->cred_exec_mutex);
+
+out_free:
 	free_bprm(bprm);
 
 out_ret:
diff --git a/fs/exec.c b/fs/exec.c
index 9bd3559ddec..32f13e29941 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
+#include "internal.h"
 
 #ifdef __alpha__
 /* for /sbin/loader handling in search_binary_handler() */
@@ -1007,15 +1008,17 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 */
 	current->mm->task_size = TASK_SIZE;
 
-	if (bprm->e_uid != current_euid() ||
-	    bprm->e_gid != current_egid()) {
-		set_dumpable(current->mm, suid_dumpable);
+	/* install the new credentials */
+	if (bprm->cred->uid != current_euid() ||
+	    bprm->cred->gid != current_egid()) {
 		current->pdeath_signal = 0;
 	} else if (file_permission(bprm->file, MAY_READ) ||
-			(bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
+		   bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
 		set_dumpable(current->mm, suid_dumpable);
 	}
 
+	current->personality &= ~bprm->per_clear;
+
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
@@ -1032,13 +1035,50 @@ out:
 
 EXPORT_SYMBOL(flush_old_exec);
 
+/*
+ * install the new credentials for this executable
+ */
+void install_exec_creds(struct linux_binprm *bprm)
+{
+	security_bprm_committing_creds(bprm);
+
+	commit_creds(bprm->cred);
+	bprm->cred = NULL;
+
+	/* cred_exec_mutex must be held at least to this point to prevent
+	 * ptrace_attach() from altering our determination of the task's
+	 * credentials; any time after this it may be unlocked */
+
+	security_bprm_committed_creds(bprm);
+}
+EXPORT_SYMBOL(install_exec_creds);
+
+/*
+ * determine how safe it is to execute the proposed program
+ * - the caller must hold current->cred_exec_mutex to protect against
+ *   PTRACE_ATTACH
+ */
+void check_unsafe_exec(struct linux_binprm *bprm)
+{
+	struct task_struct *p = current;
+
+	bprm->unsafe = tracehook_unsafe_exec(p);
+
+	if (atomic_read(&p->fs->count) > 1 ||
+	    atomic_read(&p->files->count) > 1 ||
+	    atomic_read(&p->sighand->count) > 1)
+		bprm->unsafe |= LSM_UNSAFE_SHARE;
+}
+
 /* 
  * Fill the binprm structure from the inode. 
  * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ *
+ * This may be called multiple times for binary chains (scripts for example).
  */
 int prepare_binprm(struct linux_binprm *bprm)
 {
-	int mode;
+	umode_t mode;
 	struct inode * inode = bprm->file->f_path.dentry->d_inode;
 	int retval;
 
@@ -1046,14 +1086,15 @@ int prepare_binprm(struct linux_binprm *bprm)
 	if (bprm->file->f_op == NULL)
 		return -EACCES;
 
-	bprm->e_uid = current_euid();
-	bprm->e_gid = current_egid();
+	/* clear any previous set[ug]id data from a previous binary */
+	bprm->cred->euid = current_euid();
+	bprm->cred->egid = current_egid();
 
-	if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
+	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
 		/* Set-uid? */
 		if (mode & S_ISUID) {
-			current->personality &= ~PER_CLEAR_ON_SETID;
-			bprm->e_uid = inode->i_uid;
+			bprm->per_clear |= PER_CLEAR_ON_SETID;
+			bprm->cred->euid = inode->i_uid;
 		}
 
 		/* Set-gid? */
@@ -1063,50 +1104,23 @@ int prepare_binprm(struct linux_binprm *bprm)
 		 * executable.
 		 */
 		if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
-			current->personality &= ~PER_CLEAR_ON_SETID;
-			bprm->e_gid = inode->i_gid;
+			bprm->per_clear |= PER_CLEAR_ON_SETID;
+			bprm->cred->egid = inode->i_gid;
 		}
 	}
 
 	/* fill in binprm security blob */
-	retval = security_bprm_set(bprm);
+	retval = security_bprm_set_creds(bprm);
 	if (retval)
 		return retval;
+	bprm->cred_prepared = 1;
 
-	memset(bprm->buf,0,BINPRM_BUF_SIZE);
-	return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE);
+	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
+	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
 }
 
 EXPORT_SYMBOL(prepare_binprm);
 
-static int unsafe_exec(struct task_struct *p)
-{
-	int unsafe = tracehook_unsafe_exec(p);
-
-	if (atomic_read(&p->fs->count) > 1 ||
-	    atomic_read(&p->files->count) > 1 ||
-	    atomic_read(&p->sighand->count) > 1)
-		unsafe |= LSM_UNSAFE_SHARE;
-
-	return unsafe;
-}
-
-void compute_creds(struct linux_binprm *bprm)
-{
-	int unsafe;
-
-	if (bprm->e_uid != current_uid())
-		current->pdeath_signal = 0;
-	exec_keys(current);
-
-	task_lock(current);
-	unsafe = unsafe_exec(current);
-	security_bprm_apply_creds(bprm, unsafe);
-	task_unlock(current);
-	security_bprm_post_apply_creds(bprm);
-}
-EXPORT_SYMBOL(compute_creds);
-
 /*
  * Arguments are '\0' separated strings found at the location bprm->p
  * points to; chop off the first by relocating brpm->p to right after
@@ -1259,6 +1273,8 @@ EXPORT_SYMBOL(search_binary_handler);
 void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
+	if (bprm->cred)
+		abort_creds(bprm->cred);
 	kfree(bprm);
 }
 
@@ -1284,10 +1300,20 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
+	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	if (retval < 0)
+		goto out_free;
+
+	retval = -ENOMEM;
+	bprm->cred = prepare_exec_creds();
+	if (!bprm->cred)
+		goto out_unlock;
+	check_unsafe_exec(bprm);
+
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_kfree;
+		goto out_unlock;
 
 	sched_exec();
 
@@ -1301,14 +1327,10 @@ int do_execve(char * filename,
 
 	bprm->argc = count(argv, MAX_ARG_STRINGS);
 	if ((retval = bprm->argc) < 0)
-		goto out_mm;
+		goto out;
 
 	bprm->envc = count(envp, MAX_ARG_STRINGS);
 	if ((retval = bprm->envc) < 0)
-		goto out_mm;
-
-	retval = security_bprm_alloc(bprm);
-	if (retval)
 		goto out;
 
 	retval = prepare_binprm(bprm);
@@ -1330,21 +1352,18 @@ int do_execve(char * filename,
 
 	current->flags &= ~PF_KTHREAD;
 	retval = search_binary_handler(bprm,regs);
-	if (retval >= 0) {
-		/* execve success */
-		security_bprm_free(bprm);
-		acct_update_integrals(current);
-		free_bprm(bprm);
-		if (displaced)
-			put_files_struct(displaced);
-		return retval;
-	}
+	if (retval < 0)
+		goto out;
 
-out:
-	if (bprm->security)
-		security_bprm_free(bprm);
+	/* execve succeeded */
+	mutex_unlock(&current->cred_exec_mutex);
+	acct_update_integrals(current);
+	free_bprm(bprm);
+	if (displaced)
+		put_files_struct(displaced);
+	return retval;
 
-out_mm:
+out:
 	if (bprm->mm)
 		mmput (bprm->mm);
 
@@ -1353,7 +1372,11 @@ out_file:
 		allow_write_access(bprm->file);
 		fput(bprm->file);
 	}
-out_kfree:
+
+out_unlock:
+	mutex_unlock(&current->cred_exec_mutex);
+
+out_free:
 	free_bprm(bprm);
 
 out_files:
diff --git a/fs/internal.h b/fs/internal.h
index 80aa9a02337..53af885f173 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -10,6 +10,7 @@
  */
 
 struct super_block;
+struct linux_binprm;
 
 /*
  * block_dev.c
@@ -39,6 +40,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
  */
 extern void __init chrdev_init(void);
 
+/*
+ * exec.c
+ */
+extern void check_unsafe_exec(struct linux_binprm *);
+
 /*
  * namespace.c
  */
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 0b2fcb698a6..e8ce2c4c7ac 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -508,22 +508,6 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 	return 0;
 }
 
-/*
- * ieieeeeee, an audit function without a return code!
- *
- * This function might fail!  I decided that it didn't matter.  We are too late
- * to fail the syscall and the information isn't REQUIRED for any purpose.  It's
- * just nice to have.  We should be able to look at past audit logs to figure
- * out this process's current cap set along with the fcaps from the PATH record
- * and use that to come up with the final set.  Yeah, its ugly, but all the info
- * is still in the audit log.  So I'm not going to bother mentioning we failed
- * if we couldn't allocate memory.
- *
- * If someone changes their mind they could create the aux record earlier and
- * then search here and use that earlier allocation.  But I don't wanna.
- *
- * -Eric
- */
 static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				       const struct cred *new,
 				       const struct cred *old)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7394b5b349f..6cbfbe29718 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -35,16 +35,20 @@ struct linux_binprm{
 	struct mm_struct *mm;
 	unsigned long p; /* current top of mem */
 	unsigned int sh_bang:1,
-		     misc_bang:1;
+		misc_bang:1,
+		cred_prepared:1,/* true if creds already prepared (multiple
+				 * preps happen for interpreters) */
+		cap_effective:1;/* true if has elevated effective capabilities,
+				 * false if not; except for init which inherits
+				 * its parent's caps anyway */
 #ifdef __alpha__
 	unsigned int taso:1;
 #endif
 	unsigned int recursion_depth;
 	struct file * file;
-	int e_uid, e_gid;
-	kernel_cap_t cap_post_exec_permitted;
-	bool cap_effective;
-	void *security;
+	struct cred *cred;	/* new credentials */
+	int unsafe;		/* how unsafe this exec is (mask of LSM_UNSAFE_*) */
+	unsigned int per_clear;	/* bits to clear in current->personality */
 	int argc, envc;
 	char * filename;	/* Name of binary as seen by procps */
 	char * interp;		/* Name of the binary really executed. Most
@@ -101,7 +105,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm,
 			   int executable_stack);
 extern int bprm_mm_init(struct linux_binprm *bprm);
 extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
-extern void compute_creds(struct linux_binprm *binprm);
+extern void install_exec_creds(struct linux_binprm *bprm);
 extern int do_coredump(long signr, int exit_code, struct pt_regs * regs);
 extern int set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index eaf6fa695a0..8edb4d1d542 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,8 +84,6 @@ struct thread_group_cred {
 	struct key	*process_keyring;	/* keyring private to this process */
 	struct rcu_head	rcu;			/* RCU deletion hook */
 };
-
-extern void release_tgcred(struct cred *cred);
 #endif
 
 /*
@@ -144,6 +142,7 @@ struct cred {
 extern void __put_cred(struct cred *);
 extern int copy_creds(struct task_struct *, unsigned long);
 extern struct cred *prepare_creds(void);
+extern struct cred *prepare_exec_creds(void);
 extern struct cred *prepare_usermodehelper_creds(void);
 extern int commit_creds(struct cred *);
 extern void abort_creds(struct cred *);
diff --git a/include/linux/key.h b/include/linux/key.h
index 69ecf0934b0..21d32a142c0 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -278,7 +278,6 @@ extern ctl_table key_sysctls[];
  * the userspace interface
  */
 extern int install_thread_keyring_to_cred(struct cred *cred);
-extern int exec_keys(struct task_struct *tsk);
 extern void key_fsuid_changed(struct task_struct *tsk);
 extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
@@ -294,7 +293,6 @@ extern void key_init(void);
 #define make_key_ref(k, p)		NULL
 #define key_ref_to_ptr(k)		NULL
 #define is_key_possessed(k)		0
-#define exec_keys(t)			do { } while(0)
 #define key_fsuid_changed(t)		do { } while(0)
 #define key_fsgid_changed(t)		do { } while(0)
 #define key_init()			do { } while(0)
diff --git a/include/linux/security.h b/include/linux/security.h
index 68be1125144..56a0eed6567 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -57,8 +57,7 @@ extern int cap_capset(struct cred *new, const struct cred *old,
 		      const kernel_cap_t *effective,
 		      const kernel_cap_t *inheritable,
 		      const kernel_cap_t *permitted);
-extern int cap_bprm_set_security(struct linux_binprm *bprm);
-extern int cap_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
+extern int cap_bprm_set_creds(struct linux_binprm *bprm);
 extern int cap_bprm_secureexec(struct linux_binprm *bprm);
 extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
 			      const void *value, size_t size, int flags);
@@ -110,7 +109,7 @@ extern unsigned long mmap_min_addr;
 struct sched_param;
 struct request_sock;
 
-/* bprm_apply_creds unsafe reasons */
+/* bprm->unsafe reasons */
 #define LSM_UNSAFE_SHARE	1
 #define LSM_UNSAFE_PTRACE	2
 #define LSM_UNSAFE_PTRACE_CAP	4
@@ -154,36 +153,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *
  * Security hooks for program execution operations.
  *
- * @bprm_alloc_security:
- *	Allocate and attach a security structure to the @bprm->security field.
- *	The security field is initialized to NULL when the bprm structure is
- *	allocated.
- *	@bprm contains the linux_binprm structure to be modified.
- *	Return 0 if operation was successful.
- * @bprm_free_security:
- *	@bprm contains the linux_binprm structure to be modified.
- *	Deallocate and clear the @bprm->security field.
- * @bprm_apply_creds:
- *	Compute and set the security attributes of a process being transformed
- *	by an execve operation based on the old attributes (current->security)
- *	and the information saved in @bprm->security by the set_security hook.
- *	Since this function may return an error, in which case the process will
- *      be killed.  However, it can leave the security attributes of the
- *	process unchanged if an access failure occurs at this point.
- *	bprm_apply_creds is called under task_lock.  @unsafe indicates various
- *	reasons why it may be unsafe to change security state.
- *	@bprm contains the linux_binprm structure.
- * @bprm_post_apply_creds:
- *	Runs after bprm_apply_creds with the task_lock dropped, so that
- *	functions which cannot be called safely under the task_lock can
- *	be used.  This hook is a good place to perform state changes on
- *	the process such as closing open file descriptors to which access
- *	is no longer granted if the attributes were changed.
- *	Note that a security module might need to save state between
- *	bprm_apply_creds and bprm_post_apply_creds to store the decision
- *	on whether the process may proceed.
- *	@bprm contains the linux_binprm structure.
- * @bprm_set_security:
+ * @bprm_set_creds:
  *	Save security information in the bprm->security field, typically based
  *	on information about the bprm->file, for later use by the apply_creds
  *	hook.  This hook may also optionally check permissions (e.g. for
@@ -196,15 +166,30 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@bprm contains the linux_binprm structure.
  *	Return 0 if the hook is successful and permission is granted.
  * @bprm_check_security:
- *	This hook mediates the point when a search for a binary handler	will
- *	begin.  It allows a check the @bprm->security value which is set in
- *	the preceding set_security call.  The primary difference from
- *	set_security is that the argv list and envp list are reliably
- *	available in @bprm.  This hook may be called multiple times
- *	during a single execve; and in each pass set_security is called
- *	first.
+ *	This hook mediates the point when a search for a binary handler will
+ *	begin.  It allows a check the @bprm->security value which is set in the
+ *	preceding set_creds call.  The primary difference from set_creds is
+ *	that the argv list and envp list are reliably available in @bprm.  This
+ *	hook may be called multiple times during a single execve; and in each
+ *	pass set_creds is called first.
  *	@bprm contains the linux_binprm structure.
  *	Return 0 if the hook is successful and permission is granted.
+ * @bprm_committing_creds:
+ *	Prepare to install the new security attributes of a process being
+ *	transformed by an execve operation, based on the old credentials
+ *	pointed to by @current->cred and the information set in @bprm->cred by
+ *	the bprm_set_creds hook.  @bprm points to the linux_binprm structure.
+ *	This hook is a good place to perform state changes on the process such
+ *	as closing open file descriptors to which access will no longer be
+ *	granted when the attributes are changed.  This is called immediately
+ *	before commit_creds().
+ * @bprm_committed_creds:
+ *	Tidy up after the installation of the new security attributes of a
+ *	process being transformed by an execve operation.  The new credentials
+ *	have, by this point, been set to @current->cred.  @bprm points to the
+ *	linux_binprm structure.  This hook is a good place to perform state
+ *	changes on the process such as clearing out non-inheritable signal
+ *	state.  This is called immediately after commit_creds().
  * @bprm_secureexec:
  *	Return a boolean value (0 or 1) indicating whether a "secure exec"
  *	is required.  The flag is passed in the auxiliary table
@@ -1301,13 +1286,11 @@ struct security_operations {
 	int (*settime) (struct timespec *ts, struct timezone *tz);
 	int (*vm_enough_memory) (struct mm_struct *mm, long pages);
 
-	int (*bprm_alloc_security) (struct linux_binprm *bprm);
-	void (*bprm_free_security) (struct linux_binprm *bprm);
-	int (*bprm_apply_creds) (struct linux_binprm *bprm, int unsafe);
-	void (*bprm_post_apply_creds) (struct linux_binprm *bprm);
-	int (*bprm_set_security) (struct linux_binprm *bprm);
+	int (*bprm_set_creds) (struct linux_binprm *bprm);
 	int (*bprm_check_security) (struct linux_binprm *bprm);
 	int (*bprm_secureexec) (struct linux_binprm *bprm);
+	void (*bprm_committing_creds) (struct linux_binprm *bprm);
+	void (*bprm_committed_creds) (struct linux_binprm *bprm);
 
 	int (*sb_alloc_security) (struct super_block *sb);
 	void (*sb_free_security) (struct super_block *sb);
@@ -1569,12 +1552,10 @@ int security_settime(struct timespec *ts, struct timezone *tz);
 int security_vm_enough_memory(long pages);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
 int security_vm_enough_memory_kern(long pages);
-int security_bprm_alloc(struct linux_binprm *bprm);
-void security_bprm_free(struct linux_binprm *bprm);
-int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe);
-void security_bprm_post_apply_creds(struct linux_binprm *bprm);
-int security_bprm_set(struct linux_binprm *bprm);
+int security_bprm_set_creds(struct linux_binprm *bprm);
 int security_bprm_check(struct linux_binprm *bprm);
+void security_bprm_committing_creds(struct linux_binprm *bprm);
+void security_bprm_committed_creds(struct linux_binprm *bprm);
 int security_bprm_secureexec(struct linux_binprm *bprm);
 int security_sb_alloc(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
@@ -1812,32 +1793,22 @@ static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 	return cap_vm_enough_memory(mm, pages);
 }
 
-static inline int security_bprm_alloc(struct linux_binprm *bprm)
-{
-	return 0;
-}
-
-static inline void security_bprm_free(struct linux_binprm *bprm)
-{ }
-
-static inline int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+static inline int security_bprm_set_creds(struct linux_binprm *bprm)
 {
-	return cap_bprm_apply_creds(bprm, unsafe);
+	return cap_bprm_set_creds(bprm);
 }
 
-static inline void security_bprm_post_apply_creds(struct linux_binprm *bprm)
+static inline int security_bprm_check(struct linux_binprm *bprm)
 {
-	return;
+	return 0;
 }
 
-static inline int security_bprm_set(struct linux_binprm *bprm)
+static inline void security_bprm_committing_creds(struct linux_binprm *bprm)
 {
-	return cap_bprm_set_security(bprm);
 }
 
-static inline int security_bprm_check(struct linux_binprm *bprm)
+static inline void security_bprm_committed_creds(struct linux_binprm *bprm)
 {
-	return 0;
 }
 
 static inline int security_bprm_secureexec(struct linux_binprm *bprm)
diff --git a/kernel/cred.c b/kernel/cred.c
index cb6b5eda978..e6fcdd67b2e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -68,7 +68,7 @@ static void release_tgcred_rcu(struct rcu_head *rcu)
 /*
  * Release a set of thread group credentials.
  */
-void release_tgcred(struct cred *cred)
+static void release_tgcred(struct cred *cred)
 {
 #ifdef CONFIG_KEYS
 	struct thread_group_cred *tgcred = cred->tgcred;
@@ -163,6 +163,50 @@ error:
 }
 EXPORT_SYMBOL(prepare_creds);
 
+/*
+ * Prepare credentials for current to perform an execve()
+ * - The caller must hold current->cred_exec_mutex
+ */
+struct cred *prepare_exec_creds(void)
+{
+	struct thread_group_cred *tgcred = NULL;
+	struct cred *new;
+
+#ifdef CONFIG_KEYS
+	tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+	if (!tgcred)
+		return NULL;
+#endif
+
+	new = prepare_creds();
+	if (!new) {
+		kfree(tgcred);
+		return new;
+	}
+
+#ifdef CONFIG_KEYS
+	/* newly exec'd tasks don't get a thread keyring */
+	key_put(new->thread_keyring);
+	new->thread_keyring = NULL;
+
+	/* create a new per-thread-group creds for all this set of threads to
+	 * share */
+	memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
+
+	atomic_set(&tgcred->usage, 1);
+	spin_lock_init(&tgcred->lock);
+
+	/* inherit the session keyring; new process keyring */
+	key_get(tgcred->session_keyring);
+	tgcred->process_keyring = NULL;
+
+	release_tgcred(new);
+	new->tgcred = tgcred;
+#endif
+
+	return new;
+}
+
 /*
  * prepare new credentials for the usermode helper dispatcher
  */
diff --git a/security/capability.c b/security/capability.c
index efeb6d9e0e6..185804f99ad 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -32,24 +32,19 @@ static int cap_quota_on(struct dentry *dentry)
 	return 0;
 }
 
-static int cap_bprm_alloc_security(struct linux_binprm *bprm)
+static int cap_bprm_check_security (struct linux_binprm *bprm)
 {
 	return 0;
 }
 
-static void cap_bprm_free_security(struct linux_binprm *bprm)
+static void cap_bprm_committing_creds(struct linux_binprm *bprm)
 {
 }
 
-static void cap_bprm_post_apply_creds(struct linux_binprm *bprm)
+static void cap_bprm_committed_creds(struct linux_binprm *bprm)
 {
 }
 
-static int cap_bprm_check_security(struct linux_binprm *bprm)
-{
-	return 0;
-}
-
 static int cap_sb_alloc_security(struct super_block *sb)
 {
 	return 0;
@@ -827,11 +822,9 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, syslog);
 	set_to_cap_if_null(ops, settime);
 	set_to_cap_if_null(ops, vm_enough_memory);
-	set_to_cap_if_null(ops, bprm_alloc_security);
-	set_to_cap_if_null(ops, bprm_free_security);
-	set_to_cap_if_null(ops, bprm_apply_creds);
-	set_to_cap_if_null(ops, bprm_post_apply_creds);
-	set_to_cap_if_null(ops, bprm_set_security);
+	set_to_cap_if_null(ops, bprm_set_creds);
+	set_to_cap_if_null(ops, bprm_committing_creds);
+	set_to_cap_if_null(ops, bprm_committed_creds);
 	set_to_cap_if_null(ops, bprm_check_security);
 	set_to_cap_if_null(ops, bprm_secureexec);
 	set_to_cap_if_null(ops, sb_alloc_security);
diff --git a/security/commoncap.c b/security/commoncap.c
index b5419273f92..51dfa11e8e5 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -167,7 +167,7 @@ int cap_capset(struct cred *new,
 
 static inline void bprm_clear_caps(struct linux_binprm *bprm)
 {
-	cap_clear(bprm->cap_post_exec_permitted);
+	cap_clear(bprm->cred->cap_permitted);
 	bprm->cap_effective = false;
 }
 
@@ -198,15 +198,15 @@ int cap_inode_killpriv(struct dentry *dentry)
 }
 
 static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
-					  struct linux_binprm *bprm)
+					  struct linux_binprm *bprm,
+					  bool *effective)
 {
+	struct cred *new = bprm->cred;
 	unsigned i;
 	int ret = 0;
 
 	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
-		bprm->cap_effective = true;
-	else
-		bprm->cap_effective = false;
+		*effective = true;
 
 	CAP_FOR_EACH_U32(i) {
 		__u32 permitted = caps->permitted.cap[i];
@@ -215,16 +215,13 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 		/*
 		 * pP' = (X & fP) | (pI & fI)
 		 */
-		bprm->cap_post_exec_permitted.cap[i] =
-			(current->cred->cap_bset.cap[i] & permitted) |
-			(current->cred->cap_inheritable.cap[i] & inheritable);
+		new->cap_permitted.cap[i] =
+			(new->cap_bset.cap[i] & permitted) |
+			(new->cap_inheritable.cap[i] & inheritable);
 
-		if (permitted & ~bprm->cap_post_exec_permitted.cap[i]) {
-			/*
-			 * insufficient to execute correctly
-			 */
+		if (permitted & ~new->cap_permitted.cap[i])
+			/* insufficient to execute correctly */
 			ret = -EPERM;
-		}
 	}
 
 	/*
@@ -232,7 +229,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
 	 * do not have enough capabilities, we return an error if they are
 	 * missing some "forced" (aka file-permitted) capabilities.
 	 */
-	return bprm->cap_effective ? ret : 0;
+	return *effective ? ret : 0;
 }
 
 int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
@@ -250,10 +247,9 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 
 	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps,
 				   XATTR_CAPS_SZ);
-	if (size == -ENODATA || size == -EOPNOTSUPP) {
+	if (size == -ENODATA || size == -EOPNOTSUPP)
 		/* no data, that's ok */
 		return -ENODATA;
-	}
 	if (size < 0)
 		return size;
 
@@ -262,7 +258,7 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 
 	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
 
-	switch ((magic_etc & VFS_CAP_REVISION_MASK)) {
+	switch (magic_etc & VFS_CAP_REVISION_MASK) {
 	case VFS_CAP_REVISION_1:
 		if (size != XATTR_CAPS_SZ_1)
 			return -EINVAL;
@@ -283,11 +279,12 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
 		cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
 		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
 	}
+
 	return 0;
 }
 
 /* Locate any VFS capabilities: */
-static int get_file_caps(struct linux_binprm *bprm)
+static int get_file_caps(struct linux_binprm *bprm, bool *effective)
 {
 	struct dentry *dentry;
 	int rc = 0;
@@ -313,7 +310,10 @@ static int get_file_caps(struct linux_binprm *bprm)
 		goto out;
 	}
 
-	rc = bprm_caps_from_vfs_caps(&vcaps, bprm);
+	rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective);
+	if (rc == -EINVAL)
+		printk(KERN_NOTICE "%s: cap_from_disk returned %d for %s\n",
+		       __func__, rc, bprm->filename);
 
 out:
 	dput(dentry);
@@ -334,18 +334,27 @@ int cap_inode_killpriv(struct dentry *dentry)
 	return 0;
 }
 
-static inline int get_file_caps(struct linux_binprm *bprm)
+static inline int get_file_caps(struct linux_binprm *bprm, bool *effective)
 {
 	bprm_clear_caps(bprm);
 	return 0;
 }
 #endif
 
-int cap_bprm_set_security (struct linux_binprm *bprm)
+/*
+ * set up the new credentials for an exec'd task
+ */
+int cap_bprm_set_creds(struct linux_binprm *bprm)
 {
+	const struct cred *old = current_cred();
+	struct cred *new = bprm->cred;
+	bool effective;
 	int ret;
 
-	ret = get_file_caps(bprm);
+	effective = false;
+	ret = get_file_caps(bprm, &effective);
+	if (ret < 0)
+		return ret;
 
 	if (!issecure(SECURE_NOROOT)) {
 		/*
@@ -353,63 +362,47 @@ int cap_bprm_set_security (struct linux_binprm *bprm)
 		 * executables under compatibility mode, we override the
 		 * capability sets for the file.
 		 *
-		 * If only the real uid is 0, we do not set the effective
-		 * bit.
+		 * If only the real uid is 0, we do not set the effective bit.
 		 */
-		if (bprm->e_uid == 0 || current_uid() == 0) {
+		if (new->euid == 0 || new->uid == 0) {
 			/* pP' = (cap_bset & ~0) | (pI & ~0) */
-			bprm->cap_post_exec_permitted = cap_combine(
-				current->cred->cap_bset,
-				current->cred->cap_inheritable);
-			bprm->cap_effective = (bprm->e_uid == 0);
-			ret = 0;
+			new->cap_permitted = cap_combine(old->cap_bset,
+							 old->cap_inheritable);
 		}
+		if (new->euid == 0)
+			effective = true;
 	}
 
-	return ret;
-}
-
-int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
-{
-	const struct cred *old = current_cred();
-	struct cred *new;
-
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
-
-	if (bprm->e_uid != old->uid || bprm->e_gid != old->gid ||
-	    !cap_issubset(bprm->cap_post_exec_permitted,
-			  old->cap_permitted)) {
-		set_dumpable(current->mm, suid_dumpable);
-		current->pdeath_signal = 0;
-
-		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
-			if (!capable(CAP_SETUID)) {
-				bprm->e_uid = old->uid;
-				bprm->e_gid = old->gid;
-			}
-			if (cap_limit_ptraced_target()) {
-				bprm->cap_post_exec_permitted = cap_intersect(
-					bprm->cap_post_exec_permitted,
-					new->cap_permitted);
-			}
+	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
+	 * credentials unless they have the appropriate permit
+	 */
+	if ((new->euid != old->uid ||
+	     new->egid != old->gid ||
+	     !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
+	    bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
+		/* downgrade; they get no more than they had, and maybe less */
+		if (!capable(CAP_SETUID)) {
+			new->euid = new->uid;
+			new->egid = new->gid;
 		}
+		if (cap_limit_ptraced_target())
+			new->cap_permitted = cap_intersect(new->cap_permitted,
+							   old->cap_permitted);
 	}
 
-	new->suid = new->euid = new->fsuid = bprm->e_uid;
-	new->sgid = new->egid = new->fsgid = bprm->e_gid;
+	new->suid = new->fsuid = new->euid;
+	new->sgid = new->fsgid = new->egid;
 
-	/* For init, we want to retain the capabilities set
-	 * in the init_task struct. Thus we skip the usual
-	 * capability rules */
+	/* For init, we want to retain the capabilities set in the initial
+	 * task.  Thus we skip the usual capability rules
+	 */
 	if (!is_global_init(current)) {
-		new->cap_permitted = bprm->cap_post_exec_permitted;
-		if (bprm->cap_effective)
-			new->cap_effective = bprm->cap_post_exec_permitted;
+		if (effective)
+			new->cap_effective = new->cap_permitted;
 		else
 			cap_clear(new->cap_effective);
 	}
+	bprm->cap_effective = effective;
 
 	/*
 	 * Audit candidate if current->cap_effective is set
@@ -425,23 +418,31 @@ int cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	 */
 	if (!cap_isclear(new->cap_effective)) {
 		if (!cap_issubset(CAP_FULL_SET, new->cap_effective) ||
-		    bprm->e_uid != 0 || new->uid != 0 ||
-		    issecure(SECURE_NOROOT))
-			audit_log_bprm_fcaps(bprm, new, old);
+		    new->euid != 0 || new->uid != 0 ||
+		    issecure(SECURE_NOROOT)) {
+			ret = audit_log_bprm_fcaps(bprm, new, old);
+			if (ret < 0)
+				return ret;
+		}
 	}
 
 	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
-	return commit_creds(new);
+	return 0;
 }
 
-int cap_bprm_secureexec (struct linux_binprm *bprm)
+/*
+ * determine whether a secure execution is required
+ * - the creds have been committed at this point, and are no longer available
+ *   through bprm
+ */
+int cap_bprm_secureexec(struct linux_binprm *bprm)
 {
 	const struct cred *cred = current_cred();
 
 	if (cred->uid != 0) {
 		if (bprm->cap_effective)
 			return 1;
-		if (!cap_isclear(bprm->cap_post_exec_permitted))
+		if (!cap_isclear(cred->cap_permitted))
 			return 1;
 	}
 
@@ -477,7 +478,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
 }
 
 /* moved from kernel/sys.c. */
-/* 
+/*
  * cap_emulate_setxuid() fixes the effective / permitted capabilities of
  * a process after a call to setuid, setreuid, or setresuid.
  *
@@ -491,10 +492,10 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
  *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
  *  capabilities are set to the permitted capabilities.
  *
- *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should 
+ *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
  *  never happen.
  *
- *  -astor 
+ *  -astor
  *
  * cevans - New behaviour, Oct '99
  * A process may, via prctl(), elect to keep its capabilities when it
@@ -751,4 +752,3 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 		cap_sys_admin = 1;
 	return __vm_enough_memory(mm, pages, cap_sys_admin);
 }
-
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index df329f684a6..2f5d89e92b8 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -274,48 +274,6 @@ static int install_session_keyring(struct key *keyring)
 	return commit_creds(new);
 }
 
-/*****************************************************************************/
-/*
- * deal with execve()
- */
-int exec_keys(struct task_struct *tsk)
-{
-	struct thread_group_cred *tgcred = NULL;
-	struct cred *new;
-
-#ifdef CONFIG_KEYS
-	tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
-	if (!tgcred)
-		return -ENOMEM;
-#endif
-
-	new = prepare_creds();
-	if (new < 0)
-		return -ENOMEM;
-
-	/* newly exec'd tasks don't get a thread keyring */
-	key_put(new->thread_keyring);
-	new->thread_keyring = NULL;
-
-	/* create a new per-thread-group creds for all this set of threads to
-	 * share */
-	memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
-	atomic_set(&tgcred->usage, 1);
-	spin_lock_init(&tgcred->lock);
-
-	/* inherit the session keyring; new process keyring */
-	key_get(tgcred->session_keyring);
-	tgcred->process_keyring = NULL;
-
-	release_tgcred(new);
-	new->tgcred = tgcred;
-
-	commit_creds(new);
-	return 0;
-
-} /* end exec_keys() */
-
 /*****************************************************************************/
 /*
  * the filesystem user ID changed
diff --git a/security/root_plug.c b/security/root_plug.c
index c3f68b5b372..40fb4f15e27 100644
--- a/security/root_plug.c
+++ b/security/root_plug.c
@@ -55,9 +55,9 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm)
 	struct usb_device *dev;
 
 	root_dbg("file %s, e_uid = %d, e_gid = %d\n",
-		 bprm->filename, bprm->e_uid, bprm->e_gid);
+		 bprm->filename, bprm->cred->euid, bprm->cred->egid);
 
-	if (bprm->e_gid == 0) {
+	if (bprm->cred->egid == 0) {
 		dev = usb_find_device(vendor_id, product_id);
 		if (!dev) {
 			root_dbg("e_gid = 0, and device not found, "
@@ -75,15 +75,12 @@ static struct security_operations rootplug_security_ops = {
 	.ptrace_may_access =		cap_ptrace_may_access,
 	.ptrace_traceme =		cap_ptrace_traceme,
 	.capget =			cap_capget,
-	.capset_check =			cap_capset_check,
-	.capset_set =			cap_capset_set,
+	.capset =			cap_capset,
 	.capable =			cap_capable,
 
-	.bprm_apply_creds =		cap_bprm_apply_creds,
-	.bprm_set_security =		cap_bprm_set_security,
+	.bprm_set_creds =		cap_bprm_set_creds,
 
-	.task_post_setuid =		cap_task_post_setuid,
-	.task_reparent_to_init =	cap_task_reparent_to_init,
+	.task_fix_setuid =		cap_task_fix_setuid,
 	.task_prctl =			cap_task_prctl,
 
 	.bprm_check_security =		rootplug_bprm_check_security,
diff --git a/security/security.c b/security/security.c
index a55d739c686..dc5babb2d6d 100644
--- a/security/security.c
+++ b/security/security.c
@@ -213,34 +213,24 @@ int security_vm_enough_memory_kern(long pages)
 	return security_ops->vm_enough_memory(current->mm, pages);
 }
 
-int security_bprm_alloc(struct linux_binprm *bprm)
+int security_bprm_set_creds(struct linux_binprm *bprm)
 {
-	return security_ops->bprm_alloc_security(bprm);
+	return security_ops->bprm_set_creds(bprm);
 }
 
-void security_bprm_free(struct linux_binprm *bprm)
-{
-	security_ops->bprm_free_security(bprm);
-}
-
-int security_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
-{
-	return security_ops->bprm_apply_creds(bprm, unsafe);
-}
-
-void security_bprm_post_apply_creds(struct linux_binprm *bprm)
+int security_bprm_check(struct linux_binprm *bprm)
 {
-	security_ops->bprm_post_apply_creds(bprm);
+	return security_ops->bprm_check_security(bprm);
 }
 
-int security_bprm_set(struct linux_binprm *bprm)
+void security_bprm_committing_creds(struct linux_binprm *bprm)
 {
-	return security_ops->bprm_set_security(bprm);
+	return security_ops->bprm_committing_creds(bprm);
 }
 
-int security_bprm_check(struct linux_binprm *bprm)
+void security_bprm_committed_creds(struct linux_binprm *bprm)
 {
-	return security_ops->bprm_check_security(bprm);
+	return security_ops->bprm_committed_creds(bprm);
 }
 
 int security_bprm_secureexec(struct linux_binprm *bprm)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index c71bba78872..21a59218463 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2029,59 +2029,45 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 
 /* binprm security operations */
 
-static int selinux_bprm_alloc_security(struct linux_binprm *bprm)
+static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 {
-	struct bprm_security_struct *bsec;
-
-	bsec = kzalloc(sizeof(struct bprm_security_struct), GFP_KERNEL);
-	if (!bsec)
-		return -ENOMEM;
-
-	bsec->sid = SECINITSID_UNLABELED;
-	bsec->set = 0;
-
-	bprm->security = bsec;
-	return 0;
-}
-
-static int selinux_bprm_set_security(struct linux_binprm *bprm)
-{
-	struct task_security_struct *tsec;
-	struct inode *inode = bprm->file->f_path.dentry->d_inode;
+	const struct task_security_struct *old_tsec;
+	struct task_security_struct *new_tsec;
 	struct inode_security_struct *isec;
-	struct bprm_security_struct *bsec;
-	u32 newsid;
 	struct avc_audit_data ad;
+	struct inode *inode = bprm->file->f_path.dentry->d_inode;
 	int rc;
 
-	rc = secondary_ops->bprm_set_security(bprm);
+	rc = secondary_ops->bprm_set_creds(bprm);
 	if (rc)
 		return rc;
 
-	bsec = bprm->security;
-
-	if (bsec->set)
+	/* SELinux context only depends on initial program or script and not
+	 * the script interpreter */
+	if (bprm->cred_prepared)
 		return 0;
 
-	tsec = current_security();
+	old_tsec = current_security();
+	new_tsec = bprm->cred->security;
 	isec = inode->i_security;
 
 	/* Default to the current task SID. */
-	bsec->sid = tsec->sid;
+	new_tsec->sid = old_tsec->sid;
+	new_tsec->osid = old_tsec->sid;
 
 	/* Reset fs, key, and sock SIDs on execve. */
-	tsec->create_sid = 0;
-	tsec->keycreate_sid = 0;
-	tsec->sockcreate_sid = 0;
+	new_tsec->create_sid = 0;
+	new_tsec->keycreate_sid = 0;
+	new_tsec->sockcreate_sid = 0;
 
-	if (tsec->exec_sid) {
-		newsid = tsec->exec_sid;
+	if (old_tsec->exec_sid) {
+		new_tsec->sid = old_tsec->exec_sid;
 		/* Reset exec SID on execve. */
-		tsec->exec_sid = 0;
+		new_tsec->exec_sid = 0;
 	} else {
 		/* Check for a default transition on this program. */
-		rc = security_transition_sid(tsec->sid, isec->sid,
-					     SECCLASS_PROCESS, &newsid);
+		rc = security_transition_sid(old_tsec->sid, isec->sid,
+					     SECCLASS_PROCESS, &new_tsec->sid);
 		if (rc)
 			return rc;
 	}
@@ -2090,33 +2076,63 @@ static int selinux_bprm_set_security(struct linux_binprm *bprm)
 	ad.u.fs.path = bprm->file->f_path;
 
 	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
-		newsid = tsec->sid;
+		new_tsec->sid = old_tsec->sid;
 
-	if (tsec->sid == newsid) {
-		rc = avc_has_perm(tsec->sid, isec->sid,
+	if (new_tsec->sid == old_tsec->sid) {
+		rc = avc_has_perm(old_tsec->sid, isec->sid,
 				  SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
 		if (rc)
 			return rc;
 	} else {
 		/* Check permissions for the transition. */
-		rc = avc_has_perm(tsec->sid, newsid,
+		rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
 				  SECCLASS_PROCESS, PROCESS__TRANSITION, &ad);
 		if (rc)
 			return rc;
 
-		rc = avc_has_perm(newsid, isec->sid,
+		rc = avc_has_perm(new_tsec->sid, isec->sid,
 				  SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
 		if (rc)
 			return rc;
 
-		/* Clear any possibly unsafe personality bits on exec: */
-		current->personality &= ~PER_CLEAR_ON_SETID;
+		/* Check for shared state */
+		if (bprm->unsafe & LSM_UNSAFE_SHARE) {
+			rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
+					  SECCLASS_PROCESS, PROCESS__SHARE,
+					  NULL);
+			if (rc)
+				return -EPERM;
+		}
+
+		/* Make sure that anyone attempting to ptrace over a task that
+		 * changes its SID has the appropriate permit */
+		if (bprm->unsafe &
+		    (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
+			struct task_struct *tracer;
+			struct task_security_struct *sec;
+			u32 ptsid = 0;
+
+			rcu_read_lock();
+			tracer = tracehook_tracer_task(current);
+			if (likely(tracer != NULL)) {
+				sec = __task_cred(tracer)->security;
+				ptsid = sec->sid;
+			}
+			rcu_read_unlock();
+
+			if (ptsid != 0) {
+				rc = avc_has_perm(ptsid, new_tsec->sid,
+						  SECCLASS_PROCESS,
+						  PROCESS__PTRACE, NULL);
+				if (rc)
+					return -EPERM;
+			}
+		}
 
-		/* Set the security field to the new SID. */
-		bsec->sid = newsid;
+		/* Clear any possibly unsafe personality bits on exec: */
+		bprm->per_clear |= PER_CLEAR_ON_SETID;
 	}
 
-	bsec->set = 1;
 	return 0;
 }
 
@@ -2125,7 +2141,6 @@ static int selinux_bprm_check_security(struct linux_binprm *bprm)
 	return secondary_ops->bprm_check_security(bprm);
 }
 
-
 static int selinux_bprm_secureexec(struct linux_binprm *bprm)
 {
 	const struct cred *cred = current_cred();
@@ -2141,19 +2156,13 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm)
 		   the noatsecure permission is granted between
 		   the two SIDs, i.e. ahp returns 0. */
 		atsecure = avc_has_perm(osid, sid,
-					 SECCLASS_PROCESS,
-					 PROCESS__NOATSECURE, NULL);
+					SECCLASS_PROCESS,
+					PROCESS__NOATSECURE, NULL);
 	}
 
 	return (atsecure || secondary_ops->bprm_secureexec(bprm));
 }
 
-static void selinux_bprm_free_security(struct linux_binprm *bprm)
-{
-	kfree(bprm->security);
-	bprm->security = NULL;
-}
-
 extern struct vfsmount *selinuxfs_mount;
 extern struct dentry *selinux_null;
 
@@ -2252,108 +2261,78 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 	spin_unlock(&files->file_lock);
 }
 
-static int selinux_bprm_apply_creds(struct linux_binprm *bprm, int unsafe)
+/*
+ * Prepare a process for imminent new credential changes due to exec
+ */
+static void selinux_bprm_committing_creds(struct linux_binprm *bprm)
 {
-	struct task_security_struct *tsec;
-	struct bprm_security_struct *bsec;
-	struct cred *new;
-	u32 sid;
-	int rc;
-
-	rc = secondary_ops->bprm_apply_creds(bprm, unsafe);
-	if (rc < 0)
-		return rc;
-
-	new = prepare_creds();
-	if (!new)
-		return -ENOMEM;
+	struct task_security_struct *new_tsec;
+	struct rlimit *rlim, *initrlim;
+	int rc, i;
 
-	tsec = new->security;
+	secondary_ops->bprm_committing_creds(bprm);
 
-	bsec = bprm->security;
-	sid = bsec->sid;
-
-	tsec->osid = tsec->sid;
-	bsec->unsafe = 0;
-	if (tsec->sid != sid) {
-		/* Check for shared state.  If not ok, leave SID
-		   unchanged and kill. */
-		if (unsafe & LSM_UNSAFE_SHARE) {
-			rc = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
-					PROCESS__SHARE, NULL);
-			if (rc) {
-				bsec->unsafe = 1;
-				goto out;
-			}
-		}
+	new_tsec = bprm->cred->security;
+	if (new_tsec->sid == new_tsec->osid)
+		return;
 
-		/* Check for ptracing, and update the task SID if ok.
-		   Otherwise, leave SID unchanged and kill. */
-		if (unsafe & (LSM_UNSAFE_PTRACE | LSM_UNSAFE_PTRACE_CAP)) {
-			struct task_struct *tracer;
-			struct task_security_struct *sec;
-			u32 ptsid = 0;
+	/* Close files for which the new task SID is not authorized. */
+	flush_unauthorized_files(bprm->cred, current->files);
 
-			rcu_read_lock();
-			tracer = tracehook_tracer_task(current);
-			if (likely(tracer != NULL)) {
-				sec = __task_cred(tracer)->security;
-				ptsid = sec->sid;
-			}
-			rcu_read_unlock();
+	/* Always clear parent death signal on SID transitions. */
+	current->pdeath_signal = 0;
 
-			if (ptsid != 0) {
-				rc = avc_has_perm(ptsid, sid, SECCLASS_PROCESS,
-						  PROCESS__PTRACE, NULL);
-				if (rc) {
-					bsec->unsafe = 1;
-					goto out;
-				}
-			}
+	/* Check whether the new SID can inherit resource limits from the old
+	 * SID.  If not, reset all soft limits to the lower of the current
+	 * task's hard limit and the init task's soft limit.
+	 *
+	 * Note that the setting of hard limits (even to lower them) can be
+	 * controlled by the setrlimit check.  The inclusion of the init task's
+	 * soft limit into the computation is to avoid resetting soft limits
+	 * higher than the default soft limit for cases where the default is
+	 * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK.
+	 */
+	rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS,
+			  PROCESS__RLIMITINH, NULL);
+	if (rc) {
+		for (i = 0; i < RLIM_NLIMITS; i++) {
+			rlim = current->signal->rlim + i;
+			initrlim = init_task.signal->rlim + i;
+			rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
 		}
-		tsec->sid = sid;
+		update_rlimit_cpu(rlim->rlim_cur);
 	}
-
-out:
-	commit_creds(new);
-	return 0;
 }
 
 /*
- * called after apply_creds without the task lock held
+ * Clean up the process immediately after the installation of new credentials
+ * due to exec
  */
-static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
+static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 {
-	const struct cred *cred = current_cred();
-	struct task_security_struct *tsec;
-	struct rlimit *rlim, *initrlim;
+	const struct task_security_struct *tsec = current_security();
 	struct itimerval itimer;
-	struct bprm_security_struct *bsec;
 	struct sighand_struct *psig;
+	u32 osid, sid;
 	int rc, i;
 	unsigned long flags;
 
-	tsec = current_security();
-	bsec = bprm->security;
+	secondary_ops->bprm_committed_creds(bprm);
 
-	if (bsec->unsafe) {
-		force_sig_specific(SIGKILL, current);
-		return;
-	}
-	if (tsec->osid == tsec->sid)
+	osid = tsec->osid;
+	sid = tsec->sid;
+
+	if (sid == osid)
 		return;
 
-	/* Close files for which the new task SID is not authorized. */
-	flush_unauthorized_files(cred, current->files);
-
-	/* Check whether the new SID can inherit signal state
-	   from the old SID.  If not, clear itimers to avoid
-	   subsequent signal generation and flush and unblock
-	   signals. This must occur _after_ the task SID has
-	  been updated so that any kill done after the flush
-	  will be checked against the new SID. */
-	rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS,
-			  PROCESS__SIGINH, NULL);
+	/* Check whether the new SID can inherit signal state from the old SID.
+	 * If not, clear itimers to avoid subsequent signal generation and
+	 * flush and unblock signals.
+	 *
+	 * This must occur _after_ the task SID has been updated so that any
+	 * kill done after the flush will be checked against the new SID.
+	 */
+	rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL);
 	if (rc) {
 		memset(&itimer, 0, sizeof itimer);
 		for (i = 0; i < 3; i++)
@@ -2366,32 +2345,8 @@ static void selinux_bprm_post_apply_creds(struct linux_binprm *bprm)
 		spin_unlock_irq(&current->sighand->siglock);
 	}
 
-	/* Always clear parent death signal on SID transitions. */
-	current->pdeath_signal = 0;
-
-	/* Check whether the new SID can inherit resource limits
-	   from the old SID.  If not, reset all soft limits to
-	   the lower of the current task's hard limit and the init
-	   task's soft limit.  Note that the setting of hard limits
-	   (even to lower them) can be controlled by the setrlimit
-	   check. The inclusion of the init task's soft limit into
-	   the computation is to avoid resetting soft limits higher
-	   than the default soft limit for cases where the default
-	   is lower than the hard limit, e.g. RLIMIT_CORE or
-	   RLIMIT_STACK.*/
-	rc = avc_has_perm(tsec->osid, tsec->sid, SECCLASS_PROCESS,
-			  PROCESS__RLIMITINH, NULL);
-	if (rc) {
-		for (i = 0; i < RLIM_NLIMITS; i++) {
-			rlim = current->signal->rlim + i;
-			initrlim = init_task.signal->rlim+i;
-			rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
-		}
-		update_rlimit_cpu(rlim->rlim_cur);
-	}
-
-	/* Wake up the parent if it is waiting so that it can
-	   recheck wait permission to the new task SID. */
+	/* Wake up the parent if it is waiting so that it can recheck
+	 * wait permission to the new task SID. */
 	read_lock_irq(&tasklist_lock);
 	psig = current->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
@@ -5556,12 +5511,10 @@ static struct security_operations selinux_ops = {
 	.netlink_send =			selinux_netlink_send,
 	.netlink_recv =			selinux_netlink_recv,
 
-	.bprm_alloc_security =		selinux_bprm_alloc_security,
-	.bprm_free_security =		selinux_bprm_free_security,
-	.bprm_apply_creds =		selinux_bprm_apply_creds,
-	.bprm_post_apply_creds =	selinux_bprm_post_apply_creds,
-	.bprm_set_security =		selinux_bprm_set_security,
+	.bprm_set_creds =		selinux_bprm_set_creds,
 	.bprm_check_security =		selinux_bprm_check_security,
+	.bprm_committing_creds =	selinux_bprm_committing_creds,
+	.bprm_committed_creds =		selinux_bprm_committed_creds,
 	.bprm_secureexec =		selinux_bprm_secureexec,
 
 	.sb_alloc_security =		selinux_sb_alloc_security,
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index f8be8d7fa26..3cc45168f67 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -77,17 +77,6 @@ struct ipc_security_struct {
 	u32 sid;	/* SID of IPC resource */
 };
 
-struct bprm_security_struct {
-	u32 sid;		/* SID for transformed process */
-	unsigned char set;
-
-	/*
-	 * unsafe is used to share failure information from bprm_apply_creds()
-	 * to bprm_post_apply_creds().
-	 */
-	char unsafe;
-};
-
 struct netif_security_struct {
 	int ifindex;			/* device index */
 	u32 sid;			/* SID for this interface */
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index e952b397153..de396742abf 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2596,8 +2596,7 @@ struct security_operations smack_ops = {
 	.settime = 			cap_settime,
 	.vm_enough_memory = 		cap_vm_enough_memory,
 
-	.bprm_apply_creds = 		cap_bprm_apply_creds,
-	.bprm_set_security = 		cap_bprm_set_security,
+	.bprm_set_creds = 		cap_bprm_set_creds,
 	.bprm_secureexec = 		cap_bprm_secureexec,
 
 	.sb_alloc_security = 		smack_sb_alloc_security,
-- 
cgit v1.2.3-70-g09d2


From d76b0d9b2d87cfc95686e148767cbf7d0e22bdc0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:25 +1100
Subject: CRED: Use creds in file structs

Attach creds to file structs and discard f_uid/f_gid.

file_operations::open() methods (such as hppfs_open()) should use file->f_cred
rather than current_cred().  At the moment file->f_cred will be current_cred()
at this point.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 arch/mips/kernel/vpe.c              |  4 ++--
 drivers/isdn/hysdn/hysdn_procconf.c |  6 ++++--
 fs/coda/file.c                      |  2 +-
 fs/file_table.c                     |  7 ++++---
 fs/hppfs/hppfs.c                    |  4 ++--
 include/linux/fs.h                  |  2 +-
 net/ipv4/netfilter/ipt_LOG.c        |  4 ++--
 net/ipv6/netfilter/ip6t_LOG.c       |  4 ++--
 net/netfilter/nfnetlink_log.c       |  5 +++--
 net/netfilter/xt_owner.c            | 16 ++++++++--------
 net/sched/cls_flow.c                |  4 ++--
 11 files changed, 31 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 972b2d2b840..09786e49637 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -1085,8 +1085,8 @@ static int vpe_open(struct inode *inode, struct file *filp)
 	v->load_addr = NULL;
 	v->len = 0;
 
-	v->uid = filp->f_uid;
-	v->gid = filp->f_gid;
+	v->uid = filp->f_cred->fsuid;
+	v->gid = filp->f_cred->fsgid;
 
 #ifdef CONFIG_MIPS_APSP_KSPD
 	/* get kspd to tell us when a syscall_exit happens */
diff --git a/drivers/isdn/hysdn/hysdn_procconf.c b/drivers/isdn/hysdn/hysdn_procconf.c
index 484299b031f..8f9f4912de3 100644
--- a/drivers/isdn/hysdn/hysdn_procconf.c
+++ b/drivers/isdn/hysdn/hysdn_procconf.c
@@ -246,7 +246,8 @@ hysdn_conf_open(struct inode *ino, struct file *filep)
 	}
 	if (card->debug_flags & (LOG_PROC_OPEN | LOG_PROC_ALL))
 		hysdn_addlog(card, "config open for uid=%d gid=%d mode=0x%x",
-			     filep->f_uid, filep->f_gid, filep->f_mode);
+			     filep->f_cred->fsuid, filep->f_cred->fsgid,
+			     filep->f_mode);
 
 	if ((filep->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_WRITE) {
 		/* write only access -> write boot file or conf line */
@@ -331,7 +332,8 @@ hysdn_conf_close(struct inode *ino, struct file *filep)
 	}
 	if (card->debug_flags & (LOG_PROC_OPEN | LOG_PROC_ALL))
 		hysdn_addlog(card, "config close for uid=%d gid=%d mode=0x%x",
-			     filep->f_uid, filep->f_gid, filep->f_mode);
+			     filep->f_cred->fsuid, filep->f_cred->fsgid,
+			     filep->f_mode);
 
 	if ((filep->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_WRITE) {
 		/* write only access -> write boot file or conf line */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 29137ff3ca6..5a876998549 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -174,7 +174,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 
 	err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
-			  coda_flags, coda_file->f_uid);
+			  coda_flags, coda_file->f_cred->fsuid);
 
 	host_inode = cfi->cfi_container->f_path.dentry->d_inode;
 	cii = ITOC(coda_inode);
diff --git a/fs/file_table.c b/fs/file_table.c
index bc4563fe791..0fbcacc3ea7 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,7 +36,9 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
 static inline void file_free_rcu(struct rcu_head *head)
 {
-	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
+	struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
+
+	put_cred(f->f_cred);
 	kmem_cache_free(filp_cachep, f);
 }
 
@@ -121,8 +123,7 @@ struct file *get_empty_filp(void)
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
-	f->f_uid = cred->fsuid;
-	f->f_gid = cred->fsgid;
+	f->f_cred = get_cred(cred);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 795e2c130ad..b278f7f5202 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -426,7 +426,7 @@ static int file_mode(int fmode)
 
 static int hppfs_open(struct inode *inode, struct file *file)
 {
-	const struct cred *cred = current_cred();
+	const struct cred *cred = file->f_cred;
 	struct hppfs_private *data;
 	struct vfsmount *proc_mnt;
 	struct dentry *proc_dentry;
@@ -490,7 +490,7 @@ static int hppfs_open(struct inode *inode, struct file *file)
 
 static int hppfs_dir_open(struct inode *inode, struct file *file)
 {
-	const struct cred *cred = current_cred();
+	const struct cred *cred = file->f_cred;
 	struct hppfs_private *data;
 	struct vfsmount *proc_mnt;
 	struct dentry *proc_dentry;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3bfec1327b8..c0fb6d81d89 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -827,7 +827,7 @@ struct file {
 	fmode_t			f_mode;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
-	unsigned int		f_uid, f_gid;
+	const struct cred	*f_cred;
 	struct file_ra_state	f_ra;
 
 	u64			f_version;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index fc6ce04a3e3..7b5dbe118c0 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -340,8 +340,8 @@ static void dump_packet(const struct nf_loginfo *info,
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
 			printk("UID=%u GID=%u ",
-				skb->sk->sk_socket->file->f_uid,
-				skb->sk->sk_socket->file->f_gid);
+				skb->sk->sk_socket->file->f_cred->fsuid,
+				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index caa441d0956..871d157cec4 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -364,8 +364,8 @@ static void dump_packet(const struct nf_loginfo *info,
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
 			printk("UID=%u GID=%u ",
-				skb->sk->sk_socket->file->f_uid,
-				skb->sk->sk_socket->file->f_gid);
+				skb->sk->sk_socket->file->f_cred->fsuid,
+				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
 	}
 
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 41e0105d382..38f9efd90e8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -474,8 +474,9 @@ __build_packet_message(struct nfulnl_instance *inst,
 	if (skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file) {
-			__be32 uid = htonl(skb->sk->sk_socket->file->f_uid);
-			__be32 gid = htonl(skb->sk->sk_socket->file->f_gid);
+			struct file *file = skb->sk->sk_socket->file;
+			__be32 uid = htonl(file->f_cred->fsuid);
+			__be32 gid = htonl(file->f_cred->fsgid);
 			/* need to unlock here since NLA_PUT may goto */
 			read_unlock_bh(&skb->sk->sk_callback_lock);
 			NLA_PUT_BE32(inst->skb, NFULA_UID, uid);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index f19ebd9b78f..22b2a5e881e 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -34,12 +34,12 @@ owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 		return false;
 
 	if (info->match & IPT_OWNER_UID)
-		if ((filp->f_uid != info->uid) ^
+		if ((filp->f_cred->fsuid != info->uid) ^
 		    !!(info->invert & IPT_OWNER_UID))
 			return false;
 
 	if (info->match & IPT_OWNER_GID)
-		if ((filp->f_gid != info->gid) ^
+		if ((filp->f_cred->fsgid != info->gid) ^
 		    !!(info->invert & IPT_OWNER_GID))
 			return false;
 
@@ -60,12 +60,12 @@ owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 		return false;
 
 	if (info->match & IP6T_OWNER_UID)
-		if ((filp->f_uid != info->uid) ^
+		if ((filp->f_cred->fsuid != info->uid) ^
 		    !!(info->invert & IP6T_OWNER_UID))
 			return false;
 
 	if (info->match & IP6T_OWNER_GID)
-		if ((filp->f_gid != info->gid) ^
+		if ((filp->f_cred->fsgid != info->gid) ^
 		    !!(info->invert & IP6T_OWNER_GID))
 			return false;
 
@@ -93,14 +93,14 @@ owner_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 		       (XT_OWNER_UID | XT_OWNER_GID)) == 0;
 
 	if (info->match & XT_OWNER_UID)
-		if ((filp->f_uid >= info->uid_min &&
-		    filp->f_uid <= info->uid_max) ^
+		if ((filp->f_cred->fsuid >= info->uid_min &&
+		    filp->f_cred->fsuid <= info->uid_max) ^
 		    !(info->invert & XT_OWNER_UID))
 			return false;
 
 	if (info->match & XT_OWNER_GID)
-		if ((filp->f_gid >= info->gid_min &&
-		    filp->f_gid <= info->gid_max) ^
+		if ((filp->f_cred->fsgid >= info->gid_min &&
+		    filp->f_cred->fsgid <= info->gid_max) ^
 		    !(info->invert & XT_OWNER_GID))
 			return false;
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 0ebaff637e3..0ef4e3065bc 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -260,14 +260,14 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb)
 static u32 flow_get_skuid(const struct sk_buff *skb)
 {
 	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
-		return skb->sk->sk_socket->file->f_uid;
+		return skb->sk->sk_socket->file->f_cred->fsuid;
 	return 0;
 }
 
 static u32 flow_get_skgid(const struct sk_buff *skb)
 {
 	if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
-		return skb->sk->sk_socket->file->f_gid;
+		return skb->sk->sk_socket->file->f_cred->fsgid;
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 98870ab0a5a3f1822aee681d2997017e1c87d026 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:26 +1100
Subject: CRED: Documentation

Document credentials and the new credentials API.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/credentials.txt | 582 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/cred.h          |  12 +-
 kernel/cred.c                 |   2 +-
 3 files changed, 594 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/credentials.txt

(limited to 'include/linux')

diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt
new file mode 100644
index 00000000000..df03169782e
--- /dev/null
+++ b/Documentation/credentials.txt
@@ -0,0 +1,582 @@
+			     ====================
+			     CREDENTIALS IN LINUX
+			     ====================
+
+By: David Howells <dhowells@redhat.com>
+
+Contents:
+
+ (*) Overview.
+
+ (*) Types of credentials.
+
+ (*) File markings.
+
+ (*) Task credentials.
+
+     - Immutable credentials.
+     - Accessing task credentials.
+     - Accessing another task's credentials.
+     - Altering credentials.
+     - Managing credentials.
+
+ (*) Open file credentials.
+
+ (*) Overriding the VFS's use of credentials.
+
+
+========
+OVERVIEW
+========
+
+There are several parts to the security check performed by Linux when one
+object acts upon another:
+
+ (1) Objects.
+
+     Objects are things in the system that may be acted upon directly by
+     userspace programs.  Linux has a variety of actionable objects, including:
+
+	- Tasks
+	- Files/inodes
+	- Sockets
+	- Message queues
+	- Shared memory segments
+	- Semaphores
+	- Keys
+
+     As a part of the description of all these objects there is a set of
+     credentials.  What's in the set depends on the type of object.
+
+ (2) Object ownership.
+
+     Amongst the credentials of most objects, there will be a subset that
+     indicates the ownership of that object.  This is used for resource
+     accounting and limitation (disk quotas and task rlimits for example).
+
+     In a standard UNIX filesystem, for instance, this will be defined by the
+     UID marked on the inode.
+
+ (3) The objective context.
+
+     Also amongst the credentials of those objects, there will be a subset that
+     indicates the 'objective context' of that object.  This may or may not be
+     the same set as in (2) - in standard UNIX files, for instance, this is the
+     defined by the UID and the GID marked on the inode.
+
+     The objective context is used as part of the security calculation that is
+     carried out when an object is acted upon.
+
+ (4) Subjects.
+
+     A subject is an object that is acting upon another object.
+
+     Most of the objects in the system are inactive: they don't act on other
+     objects within the system.  Processes/tasks are the obvious exception:
+     they do stuff; they access and manipulate things.
+
+     Objects other than tasks may under some circumstances also be subjects.
+     For instance an open file may send SIGIO to a task using the UID and EUID
+     given to it by a task that called fcntl(F_SETOWN) upon it.  In this case,
+     the file struct will have a subjective context too.
+
+ (5) The subjective context.
+
+     A subject has an additional interpretation of its credentials.  A subset
+     of its credentials forms the 'subjective context'.  The subjective context
+     is used as part of the security calculation that is carried out when a
+     subject acts.
+
+     A Linux task, for example, has the FSUID, FSGID and the supplementary
+     group list for when it is acting upon a file - which are quite separate
+     from the real UID and GID that normally form the objective context of the
+     task.
+
+ (6) Actions.
+
+     Linux has a number of actions available that a subject may perform upon an
+     object.  The set of actions available depends on the nature of the subject
+     and the object.
+
+     Actions include reading, writing, creating and deleting files; forking or
+     signalling and tracing tasks.
+
+ (7) Rules, access control lists and security calculations.
+
+     When a subject acts upon an object, a security calculation is made.  This
+     involves taking the subjective context, the objective context and the
+     action, and searching one or more sets of rules to see whether the subject
+     is granted or denied permission to act in the desired manner on the
+     object, given those contexts.
+
+     There are two main sources of rules:
+
+     (a) Discretionary access control (DAC):
+
+	 Sometimes the object will include sets of rules as part of its
+	 description.  This is an 'Access Control List' or 'ACL'.  A Linux
+	 file may supply more than one ACL.
+
+	 A traditional UNIX file, for example, includes a permissions mask that
+	 is an abbreviated ACL with three fixed classes of subject ('user',
+	 'group' and 'other'), each of which may be granted certain privileges
+	 ('read', 'write' and 'execute' - whatever those map to for the object
+	 in question).  UNIX file permissions do not allow the arbitrary
+	 specification of subjects, however, and so are of limited use.
+
+	 A Linux file might also sport a POSIX ACL.  This is a list of rules
+	 that grants various permissions to arbitrary subjects.
+
+     (b) Mandatory access control (MAC):
+
+	 The system as a whole may have one or more sets of rules that get
+	 applied to all subjects and objects, regardless of their source.
+	 SELinux and Smack are examples of this.
+
+	 In the case of SELinux and Smack, each object is given a label as part
+	 of its credentials.  When an action is requested, they take the
+	 subject label, the object label and the action and look for a rule
+	 that says that this action is either granted or denied.
+
+
+====================
+TYPES OF CREDENTIALS
+====================
+
+The Linux kernel supports the following types of credentials:
+
+ (1) Traditional UNIX credentials.
+
+	Real User ID
+	Real Group ID
+
+     The UID and GID are carried by most, if not all, Linux objects, even if in
+     some cases it has to be invented (FAT or CIFS files for example, which are
+     derived from Windows).  These (mostly) define the objective context of
+     that object, with tasks being slightly different in some cases.
+
+	Effective, Saved and FS User ID
+	Effective, Saved and FS Group ID
+	Supplementary groups
+
+     These are additional credentials used by tasks only.  Usually, an
+     EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID
+     will be used as the objective.  For tasks, it should be noted that this is
+     not always true.
+
+ (2) Capabilities.
+
+	Set of permitted capabilities
+	Set of inheritable capabilities
+	Set of effective capabilities
+	Capability bounding set
+
+     These are only carried by tasks.  They indicate superior capabilities
+     granted piecemeal to a task that an ordinary task wouldn't otherwise have.
+     These are manipulated implicitly by changes to the traditional UNIX
+     credentials, but can also be manipulated directly by the capset() system
+     call.
+
+     The permitted capabilities are those caps that the process might grant
+     itself to its effective or permitted sets through capset().  This
+     inheritable set might also be so constrained.
+
+     The effective capabilities are the ones that a task is actually allowed to
+     make use of itself.
+
+     The inheritable capabilities are the ones that may get passed across
+     execve().
+
+     The bounding set limits the capabilities that may be inherited across
+     execve(), especially when a binary is executed that will execute as UID 0.
+
+ (3) Secure management flags (securebits).
+
+     These are only carried by tasks.  These govern the way the above
+     credentials are manipulated and inherited over certain operations such as
+     execve().  They aren't used directly as objective or subjective
+     credentials.
+
+ (4) Keys and keyrings.
+
+     These are only carried by tasks.  They carry and cache security tokens
+     that don't fit into the other standard UNIX credentials.  They are for
+     making such things as network filesystem keys available to the file
+     accesses performed by processes, without the necessity of ordinary
+     programs having to know about security details involved.
+
+     Keyrings are a special type of key.  They carry sets of other keys and can
+     be searched for the desired key.  Each process may subscribe to a number
+     of keyrings:
+
+	Per-thread keying
+	Per-process keyring
+	Per-session keyring
+
+     When a process accesses a key, if not already present, it will normally be
+     cached on one of these keyrings for future accesses to find.
+
+     For more information on using keys, see Documentation/keys.txt.
+
+ (5) LSM
+
+     The Linux Security Module allows extra controls to be placed over the
+     operations that a task may do.  Currently Linux supports two main
+     alternate LSM options: SELinux and Smack.
+
+     Both work by labelling the objects in a system and then applying sets of
+     rules (policies) that say what operations a task with one label may do to
+     an object with another label.
+
+ (6) AF_KEY
+
+     This is a socket-based approach to credential management for networking
+     stacks [RFC 2367].  It isn't discussed by this document as it doesn't
+     interact directly with task and file credentials; rather it keeps system
+     level credentials.
+
+
+When a file is opened, part of the opening task's subjective context is
+recorded in the file struct created.  This allows operations using that file
+struct to use those credentials instead of the subjective context of the task
+that issued the operation.  An example of this would be a file opened on a
+network filesystem where the credentials of the opened file should be presented
+to the server, regardless of who is actually doing a read or a write upon it.
+
+
+=============
+FILE MARKINGS
+=============
+
+Files on disk or obtained over the network may have annotations that form the
+objective security context of that file.  Depending on the type of filesystem,
+this may include one or more of the following:
+
+ (*) UNIX UID, GID, mode;
+
+ (*) Windows user ID;
+
+ (*) Access control list;
+
+ (*) LSM security label;
+
+ (*) UNIX exec privilege escalation bits (SUID/SGID);
+
+ (*) File capabilities exec privilege escalation bits.
+
+These are compared to the task's subjective security context, and certain
+operations allowed or disallowed as a result.  In the case of execve(), the
+privilege escalation bits come into play, and may allow the resulting process
+extra privileges, based on the annotations on the executable file.
+
+
+================
+TASK CREDENTIALS
+================
+
+In Linux, all of a task's credentials are held in (uid, gid) or through
+(groups, keys, LSM security) a refcounted structure of type 'struct cred'.
+Each task points to its credentials by a pointer called 'cred' in its
+task_struct.
+
+Once a set of credentials has been prepared and committed, it may not be
+changed, barring the following exceptions:
+
+ (1) its reference count may be changed;
+
+ (2) the reference count on the group_info struct it points to may be changed;
+
+ (3) the reference count on the security data it points to may be changed;
+
+ (4) the reference count on any keyrings it points to may be changed;
+
+ (5) any keyrings it points to may be revoked, expired or have their security
+     attributes changed; and
+
+ (6) the contents of any keyrings to which it points may be changed (the whole
+     point of keyrings being a shared set of credentials, modifiable by anyone
+     with appropriate access).
+
+To alter anything in the cred struct, the copy-and-replace principle must be
+adhered to.  First take a copy, then alter the copy and then use RCU to change
+the task pointer to make it point to the new copy.  There are wrappers to aid
+with this (see below).
+
+A task may only alter its _own_ credentials; it is no longer permitted for a
+task to alter another's credentials.  This means the capset() system call is no
+longer permitted to take any PID other than the one of the current process.
+Also keyctl_instantiate() and keyctl_negate() functions no longer permit
+attachment to process-specific keyrings in the requesting process as the
+instantiating process may need to create them.
+
+
+IMMUTABLE CREDENTIALS
+---------------------
+
+Once a set of credentials has been made public (by calling commit_creds() for
+example), it must be considered immutable, barring two exceptions:
+
+ (1) The reference count may be altered.
+
+ (2) Whilst the keyring subscriptions of a set of credentials may not be
+     changed, the keyrings subscribed to may have their contents altered.
+
+To catch accidental credential alteration at compile time, struct task_struct
+has _const_ pointers to its credential sets, as does struct file.  Furthermore,
+certain functions such as get_cred() and put_cred() operate on const pointers,
+thus rendering casts unnecessary, but require to temporarily ditch the const
+qualification to be able to alter the reference count.
+
+
+ACCESSING TASK CREDENTIALS
+--------------------------
+
+A task being able to alter only its own credentials permits the current process
+to read or replace its own credentials without the need for any form of locking
+- which simplifies things greatly.  It can just call:
+
+	const struct cred *current_cred()
+
+to get a pointer to its credentials structure, and it doesn't have to release
+it afterwards.
+
+There are convenience wrappers for retrieving specific aspects of a task's
+credentials (the value is simply returned in each case):
+
+	uid_t current_uid(void)		Current's real UID
+	gid_t current_gid(void)		Current's real GID
+	uid_t current_euid(void)	Current's effective UID
+	gid_t current_egid(void)	Current's effective GID
+	uid_t current_fsuid(void)	Current's file access UID
+	gid_t current_fsgid(void)	Current's file access GID
+	kernel_cap_t current_cap(void)	Current's effective capabilities
+	void *current_security(void)	Current's LSM security pointer
+	struct user_struct *current_user(void)  Current's user account
+
+There are also convenience wrappers for retrieving specific associated pairs of
+a task's credentials:
+
+	void current_uid_gid(uid_t *, gid_t *);
+	void current_euid_egid(uid_t *, gid_t *);
+	void current_fsuid_fsgid(uid_t *, gid_t *);
+
+which return these pairs of values through their arguments after retrieving
+them from the current task's credentials.
+
+
+In addition, there is a function for obtaining a reference on the current
+process's current set of credentials:
+
+	const struct cred *get_current_cred(void);
+
+and functions for getting references to one of the credentials that don't
+actually live in struct cred:
+
+	struct user_struct *get_current_user(void);
+	struct group_info *get_current_groups(void);
+
+which get references to the current process's user accounting structure and
+supplementary groups list respectively.
+
+Once a reference has been obtained, it must be released with put_cred(),
+free_uid() or put_group_info() as appropriate.
+
+
+ACCESSING ANOTHER TASK'S CREDENTIALS
+------------------------------------
+
+Whilst a task may access its own credentials without the need for locking, the
+same is not true of a task wanting to access another task's credentials.  It
+must use the RCU read lock and rcu_dereference().
+
+The rcu_dereference() is wrapped by:
+
+	const struct cred *__task_cred(struct task_struct *task);
+
+This should be used inside the RCU read lock, as in the following example:
+
+	void foo(struct task_struct *t, struct foo_data *f)
+	{
+		const struct cred *tcred;
+		...
+		rcu_read_lock();
+		tcred = __task_cred(t);
+		f->uid = tcred->uid;
+		f->gid = tcred->gid;
+		f->groups = get_group_info(tcred->groups);
+		rcu_read_unlock();
+		...
+	}
+
+A function need not get RCU read lock to use __task_cred() if it is holding a
+spinlock at the time as this implicitly holds the RCU read lock.
+
+Should it be necessary to hold another task's credentials for a long period of
+time, and possibly to sleep whilst doing so, then the caller should get a
+reference on them using:
+
+	const struct cred *get_task_cred(struct task_struct *task);
+
+This does all the RCU magic inside of it.  The caller must call put_cred() on
+the credentials so obtained when they're finished with.
+
+There are a couple of convenience functions to access bits of another task's
+credentials, hiding the RCU magic from the caller:
+
+	uid_t task_uid(task)		Task's real UID
+	uid_t task_euid(task)		Task's effective UID
+
+If the caller is holding a spinlock or the RCU read lock at the time anyway,
+then:
+
+	__task_cred(task)->uid
+	__task_cred(task)->euid
+
+should be used instead.  Similarly, if multiple aspects of a task's credentials
+need to be accessed, RCU read lock or a spinlock should be used, __task_cred()
+called, the result stored in a temporary pointer and then the credential
+aspects called from that before dropping the lock.  This prevents the
+potentially expensive RCU magic from being invoked multiple times.
+
+Should some other single aspect of another task's credentials need to be
+accessed, then this can be used:
+
+	task_cred_xxx(task, member)
+
+where 'member' is a non-pointer member of the cred struct.  For instance:
+
+	uid_t task_cred_xxx(task, suid);
+
+will retrieve 'struct cred::suid' from the task, doing the appropriate RCU
+magic.  This may not be used for pointer members as what they point to may
+disappear the moment the RCU read lock is dropped.
+
+
+ALTERING CREDENTIALS
+--------------------
+
+As previously mentioned, a task may only alter its own credentials, and may not
+alter those of another task.  This means that it doesn't need to use any
+locking to alter its own credentials.
+
+To alter the current process's credentials, a function should first prepare a
+new set of credentials by calling:
+
+	struct cred *prepare_creds(void);
+
+this locks current->cred_replace_mutex and then allocates and constructs a
+duplicate of the current process's credentials, returning with the mutex still
+held if successful.  It returns NULL if not successful (out of memory).
+
+The mutex prevents ptrace() from altering the ptrace state of a process whilst
+security checks on credentials construction and changing is taking place as
+the ptrace state may alter the outcome, particularly in the case of execve().
+
+The new credentials set should be altered appropriately, and any security
+checks and hooks done.  Both the current and the proposed sets of credentials
+are available for this purpose as current_cred() will return the current set
+still at this point.
+
+
+When the credential set is ready, it should be committed to the current process
+by calling:
+
+	int commit_creds(struct cred *new);
+
+This will alter various aspects of the credentials and the process, giving the
+LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually
+commit the new credentials to current->cred, it will release
+current->cred_replace_mutex to allow ptrace() to take place, and it will notify
+the scheduler and others of the changes.
+
+This function is guaranteed to return 0, so that it can be tail-called at the
+end of such functions as sys_setresuid().
+
+Note that this function consumes the caller's reference to the new credentials.
+The caller should _not_ call put_cred() on the new credentials afterwards.
+
+Furthermore, once this function has been called on a new set of credentials,
+those credentials may _not_ be changed further.
+
+
+Should the security checks fail or some other error occur after prepare_creds()
+has been called, then the following function should be invoked:
+
+	void abort_creds(struct cred *new);
+
+This releases the lock on current->cred_replace_mutex that prepare_creds() got
+and then releases the new credentials.
+
+
+A typical credentials alteration function would look something like this:
+
+	int alter_suid(uid_t suid)
+	{
+		struct cred *new;
+		int ret;
+
+		new = prepare_creds();
+		if (!new)
+			return -ENOMEM;
+
+		new->suid = suid;
+		ret = security_alter_suid(new);
+		if (ret < 0) {
+			abort_creds(new);
+			return ret;
+		}
+
+		return commit_creds(new);
+	}
+
+
+MANAGING CREDENTIALS
+--------------------
+
+There are some functions to help manage credentials:
+
+ (*) void put_cred(const struct cred *cred);
+
+     This releases a reference to the given set of credentials.  If the
+     reference count reaches zero, the credentials will be scheduled for
+     destruction by the RCU system.
+
+ (*) const struct cred *get_cred(const struct cred *cred);
+
+     This gets a reference on a live set of credentials, returning a pointer to
+     that set of credentials.
+
+ (*) struct cred *get_new_cred(struct cred *cred);
+
+     This gets a reference on a set of credentials that is under construction
+     and is thus still mutable, returning a pointer to that set of credentials.
+
+
+=====================
+OPEN FILE CREDENTIALS
+=====================
+
+When a new file is opened, a reference is obtained on the opening task's
+credentials and this is attached to the file struct as 'f_cred' in place of
+'f_uid' and 'f_gid'.  Code that used to access file->f_uid and file->f_gid
+should now access file->f_cred->fsuid and file->f_cred->fsgid.
+
+It is safe to access f_cred without the use of RCU or locking because the
+pointer will not change over the lifetime of the file struct, and nor will the
+contents of the cred struct pointed to, barring the exceptions listed above
+(see the Task Credentials section).
+
+
+=======================================
+OVERRIDING THE VFS'S USE OF CREDENTIALS
+=======================================
+
+Under some circumstances it is desirable to override the credentials used by
+the VFS, and that can be done by calling into such as vfs_mkdir() with a
+different set of credentials.  This is done in the following places:
+
+ (*) sys_faccessat().
+
+ (*) do_coredump().
+
+ (*) nfs4recover.c.
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8edb4d1d542..794aab5c66e 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -1,4 +1,4 @@
-/* Credentials management
+/* Credentials management - see Documentation/credentials.txt
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -169,6 +169,12 @@ static inline struct cred *get_new_cred(struct cred *cred)
  *
  * Get a reference on the specified set of credentials.  The caller must
  * release the reference.
+ *
+ * This is used to deal with a committed set of credentials.  Although the
+ * pointer is const, this will temporarily discard the const and increment the
+ * usage count.  The purpose of this is to attempt to catch at compile time the
+ * accidental alteration of a set of credentials that should be considered
+ * immutable.
  */
 static inline const struct cred *get_cred(const struct cred *cred)
 {
@@ -181,6 +187,10 @@ static inline const struct cred *get_cred(const struct cred *cred)
  *
  * Release a reference to a set of credentials, deleting them when the last ref
  * is released.
+ *
+ * This takes a const pointer to a set of credentials because the credentials
+ * on task_struct are attached by const pointers to prevent accidental
+ * alteration of otherwise immutable credential sets.
  */
 static inline void put_cred(const struct cred *_cred)
 {
diff --git a/kernel/cred.c b/kernel/cred.c
index e6fcdd67b2e..b8bd2f99d8c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management
+/* Task credentials management - see Documentation/credentials.txt
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
-- 
cgit v1.2.3-70-g09d2


From 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:26 +1100
Subject: CRED: Differentiate objective and effective subjective credentials on
 a task

Differentiate the objective and real subjective credentials from the effective
subjective credentials on a task by introducing a second credentials pointer
into the task_struct.

task_struct::real_cred then refers to the objective and apparent real
subjective credentials of a task, as perceived by the other tasks in the
system.

task_struct::cred then refers to the effective subjective credentials of a
task, as used by that task when it's actually running.  These are not visible
to the other tasks in the system.

__task_cred(task) then refers to the objective/real credentials of the task in
question.

current_cred() refers to the effective subjective credentials of the current
task.

prepare_creds() uses the objective creds as a base and commit_creds() changes
both pointers in the task_struct (indeed commit_creds() requires them to be the
same).

override_creds() and revert_creds() change the subjective creds pointer only,
and the former returns the old subjective creds.  These are used by NFSD,
faccessat() and do_coredump(), and will by used by CacheFiles.

In SELinux, current_has_perm() is provided as an alternative to
task_has_perm().  This uses the effective subjective context of current,
whereas task_has_perm() uses the objective/real context of the subject.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/nfsd/auth.c            |  5 +++-
 include/linux/cred.h      | 29 +++++++++++----------
 include/linux/init_task.h |  1 +
 include/linux/sched.h     |  5 +++-
 kernel/cred.c             | 38 ++++++++++++++++++---------
 kernel/fork.c             |  6 +++--
 security/selinux/hooks.c  | 65 ++++++++++++++++++++++++++++++-----------------
 7 files changed, 95 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 836ffa1047d..0184fe9b514 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	int flags = nfsexp_flags(rqstp, exp);
 	int ret;
 
+	/* discard any old override before preparing the new set */
+	revert_creds(get_cred(current->real_cred));
 	new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
@@ -82,7 +84,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
-	return commit_creds(new);
+	put_cred(override_creds(new));
+	return 0;
 
 oom:
 	ret = -ENOMEM;
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 794aab5c66e..55a9c995d69 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -146,8 +146,8 @@ extern struct cred *prepare_exec_creds(void);
 extern struct cred *prepare_usermodehelper_creds(void);
 extern int commit_creds(struct cred *);
 extern void abort_creds(struct cred *);
-extern const struct cred *override_creds(const struct cred *) __deprecated;
-extern void revert_creds(const struct cred *) __deprecated;
+extern const struct cred *override_creds(const struct cred *);
+extern void revert_creds(const struct cred *);
 extern void __init cred_init(void);
 
 /**
@@ -202,32 +202,32 @@ static inline void put_cred(const struct cred *_cred)
 }
 
 /**
- * current_cred - Access the current task's credentials
+ * current_cred - Access the current task's subjective credentials
  *
- * Access the credentials of the current task.
+ * Access the subjective credentials of the current task.
  */
 #define current_cred() \
 	(current->cred)
 
 /**
- * __task_cred - Access another task's credentials
+ * __task_cred - Access a task's objective credentials
  * @task: The task to query
  *
- * Access the credentials of another task.  The caller must hold the
- * RCU readlock.
+ * Access the objective credentials of a task.  The caller must hold the RCU
+ * readlock.
  *
  * The caller must make sure task doesn't go away, either by holding a ref on
  * task or by holding tasklist_lock to prevent it from being unlinked.
  */
 #define __task_cred(task) \
-	((const struct cred *)(rcu_dereference((task)->cred)))
+	((const struct cred *)(rcu_dereference((task)->real_cred)))
 
 /**
- * get_task_cred - Get another task's credentials
+ * get_task_cred - Get another task's objective credentials
  * @task: The task to query
  *
- * Get the credentials of a task, pinning them so that they can't go away.
- * Accessing a task's credentials directly is not permitted.
+ * Get the objective credentials of a task, pinning them so that they can't go
+ * away.  Accessing a task's credentials directly is not permitted.
  *
  * The caller must make sure task doesn't go away, either by holding a ref on
  * task or by holding tasklist_lock to prevent it from being unlinked.
@@ -243,10 +243,11 @@ static inline void put_cred(const struct cred *_cred)
 })
 
 /**
- * get_current_cred - Get the current task's credentials
+ * get_current_cred - Get the current task's subjective credentials
  *
- * Get the credentials of the current task, pinning them so that they can't go
- * away.  Accessing the current task's credentials directly is not permitted.
+ * Get the subjective credentials of the current task, pinning them so that
+ * they can't go away.  Accessing the current task's credentials directly is
+ * not permitted.
  */
 #define get_current_cred()				\
 	(get_cred(current_cred()))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 08c3b24ad9a..2597858035c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -149,6 +149,7 @@ extern struct cred init_cred;
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
 	.group_leader	= &tsk,						\
+	.real_cred	= &init_cred,					\
 	.cred		= &init_cred,					\
 	.cred_exec_mutex =						\
 		 __MUTEX_INITIALIZER(tsk.cred_exec_mutex),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 121d655e460..3443123b070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1145,7 +1145,10 @@ struct task_struct {
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	const struct cred *cred;	/* actual/objective task credentials (COW) */
+	const struct cred *real_cred;	/* objective and real subjective task
+					 * credentials (COW) */
+	const struct cred *cred;	/* effective (overridable) subjective task
+					 * credentials (COW) */
 	struct mutex cred_exec_mutex;	/* execve vs ptrace cred calculation mutex */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
diff --git a/kernel/cred.c b/kernel/cred.c
index b8bd2f99d8c..f3ca1066061 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct thread_group_cred init_tgcred = {
  * The initial credentials for the initial task
  */
 struct cred init_cred = {
-	.usage			= ATOMIC_INIT(3),
+	.usage			= ATOMIC_INIT(4),
 	.securebits		= SECUREBITS_DEFAULT,
 	.cap_inheritable	= CAP_INIT_INH_SET,
 	.cap_permitted		= CAP_FULL_SET,
@@ -120,6 +120,8 @@ EXPORT_SYMBOL(__put_cred);
  * prepare a new copy, which the caller then modifies and then commits by
  * calling commit_creds().
  *
+ * Preparation involves making a copy of the objective creds for modification.
+ *
  * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
  *
  * Call commit_creds() or abort_creds() to clean up.
@@ -130,7 +132,7 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	BUG_ON(atomic_read(&task->cred->usage) < 1);
+	BUG_ON(atomic_read(&task->real_cred->usage) < 1);
 
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
@@ -262,6 +264,9 @@ error:
  *
  * We share if we can, but under some circumstances we have to generate a new
  * set.
+ *
+ * The new process gets the current process's subjective credentials as its
+ * objective and subjective credentials
  */
 int copy_creds(struct task_struct *p, unsigned long clone_flags)
 {
@@ -278,6 +283,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 		clone_flags & CLONE_THREAD
 	    ) {
+		p->real_cred = get_cred(p->cred);
 		get_cred(p->cred);
 		atomic_inc(&p->cred->user->processes);
 		return 0;
@@ -317,7 +323,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 #endif
 
 	atomic_inc(&new->user->processes);
-	p->cred = new;
+	p->cred = p->real_cred = get_cred(new);
 	return 0;
 }
 
@@ -326,7 +332,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
  * @new: The credentials to be assigned
  *
  * Install a new set of credentials to the current task, using RCU to replace
- * the old set.
+ * the old set.  Both the objective and the subjective credentials pointers are
+ * updated.  This function may not be called if the subjective credentials are
+ * in an overridden state.
  *
  * This function eats the caller's reference to the new credentials.
  *
@@ -338,12 +346,15 @@ int commit_creds(struct cred *new)
 	struct task_struct *task = current;
 	const struct cred *old;
 
+	BUG_ON(task->cred != task->real_cred);
+	BUG_ON(atomic_read(&task->real_cred->usage) < 2);
 	BUG_ON(atomic_read(&new->usage) < 1);
-	BUG_ON(atomic_read(&task->cred->usage) < 1);
 
-	old = task->cred;
+	old = task->real_cred;
 	security_commit_creds(new, old);
 
+	get_cred(new); /* we will require a ref for the subj creds too */
+
 	/* dumpability changes */
 	if (old->euid != new->euid ||
 	    old->egid != new->egid ||
@@ -369,6 +380,7 @@ int commit_creds(struct cred *new)
 	 */
 	if (new->user != old->user)
 		atomic_inc(&new->user->processes);
+	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
 		atomic_dec(&old->user->processes);
@@ -388,6 +400,8 @@ int commit_creds(struct cred *new)
 	    new->fsgid != old->fsgid)
 		proc_id_connector(task, PROC_EVENT_GID);
 
+	/* release the old obj and subj refs both */
+	put_cred(old);
 	put_cred(old);
 	return 0;
 }
@@ -408,11 +422,11 @@ void abort_creds(struct cred *new)
 EXPORT_SYMBOL(abort_creds);
 
 /**
- * override_creds - Temporarily override the current process's credentials
+ * override_creds - Override the current process's subjective credentials
  * @new: The credentials to be assigned
  *
- * Install a set of temporary override credentials on the current process,
- * returning the old set for later reversion.
+ * Install a set of temporary override subjective credentials on the current
+ * process, returning the old set for later reversion.
  */
 const struct cred *override_creds(const struct cred *new)
 {
@@ -424,11 +438,11 @@ const struct cred *override_creds(const struct cred *new)
 EXPORT_SYMBOL(override_creds);
 
 /**
- * revert_creds - Revert a temporary credentials override
+ * revert_creds - Revert a temporary subjective credentials override
  * @old: The credentials to be restored
  *
- * Revert a temporary set of override credentials to an old set, discarding the
- * override set.
+ * Revert a temporary set of override subjective credentials to an old set,
+ * discarding the override set.
  */
 void revert_creds(const struct cred *old)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 82a7948a664..af0d0f04585 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,6 +146,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	put_cred(tsk->real_cred);
 	put_cred(tsk->cred);
 	delayacct_tsk_free(tsk);
 
@@ -961,10 +962,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
 	retval = -EAGAIN;
-	if (atomic_read(&p->cred->user->processes) >=
+	if (atomic_read(&p->real_cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->cred->user != current->nsproxy->user_ns->root_user)
+		    p->real_cred->user != current->nsproxy->user_ns->root_user)
 			goto bad_fork_free;
 	}
 
@@ -1278,6 +1279,7 @@ bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
+	put_cred(p->real_cred);
 	put_cred(p->cred);
 bad_fork_free:
 	free_task(p);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 21a59218463..91b06f2aa96 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -161,7 +161,7 @@ static int selinux_secmark_enabled(void)
  */
 static void cred_init_security(void)
 {
-	struct cred *cred = (struct cred *) current->cred;
+	struct cred *cred = (struct cred *) current->real_cred;
 	struct task_security_struct *tsec;
 
 	tsec = kzalloc(sizeof(struct task_security_struct), GFP_KERNEL);
@@ -184,7 +184,7 @@ static inline u32 cred_sid(const struct cred *cred)
 }
 
 /*
- * get the security ID of a task
+ * get the objective security ID of a task
  */
 static inline u32 task_sid(const struct task_struct *task)
 {
@@ -197,7 +197,7 @@ static inline u32 task_sid(const struct task_struct *task)
 }
 
 /*
- * get the security ID of the current task
+ * get the subjective security ID of the current task
  */
 static inline u32 current_sid(void)
 {
@@ -1395,6 +1395,7 @@ static int cred_has_perm(const struct cred *actor,
  * Check permission between a pair of tasks, e.g. signal checks,
  * fork check, ptrace check, etc.
  * tsk1 is the actor and tsk2 is the target
+ * - this uses the default subjective creds of tsk1
  */
 static int task_has_perm(const struct task_struct *tsk1,
 			 const struct task_struct *tsk2,
@@ -1410,6 +1411,22 @@ static int task_has_perm(const struct task_struct *tsk1,
 	return avc_has_perm(sid1, sid2, SECCLASS_PROCESS, perms, NULL);
 }
 
+/*
+ * Check permission between current and another task, e.g. signal checks,
+ * fork check, ptrace check, etc.
+ * current is the actor and tsk2 is the target
+ * - this uses current's subjective creds
+ */
+static int current_has_perm(const struct task_struct *tsk,
+			    u32 perms)
+{
+	u32 sid, tsid;
+
+	sid = current_sid();
+	tsid = task_sid(tsk);
+	return avc_has_perm(sid, tsid, SECCLASS_PROCESS, perms, NULL);
+}
+
 #if CAP_LAST_CAP > 63
 #error Fix SELinux to handle capabilities > 63.
 #endif
@@ -1807,7 +1824,7 @@ static int selinux_ptrace_may_access(struct task_struct *child,
 		return avc_has_perm(sid, csid, SECCLASS_FILE, FILE__READ, NULL);
 	}
 
-	return task_has_perm(current, child, PROCESS__PTRACE);
+	return current_has_perm(child, PROCESS__PTRACE);
 }
 
 static int selinux_ptrace_traceme(struct task_struct *parent)
@@ -1826,7 +1843,7 @@ static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 {
 	int error;
 
-	error = task_has_perm(current, target, PROCESS__GETCAP);
+	error = current_has_perm(target, PROCESS__GETCAP);
 	if (error)
 		return error;
 
@@ -3071,7 +3088,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
 		} else if (!vma->vm_file &&
 			   vma->vm_start <= vma->vm_mm->start_stack &&
 			   vma->vm_end >= vma->vm_mm->start_stack) {
-			rc = task_has_perm(current, current, PROCESS__EXECSTACK);
+			rc = current_has_perm(current, PROCESS__EXECSTACK);
 		} else if (vma->vm_file && vma->anon_vma) {
 			/*
 			 * We are making executable a file mapping that has
@@ -3220,7 +3237,7 @@ static int selinux_task_create(unsigned long clone_flags)
 	if (rc)
 		return rc;
 
-	return task_has_perm(current, current, PROCESS__FORK);
+	return current_has_perm(current, PROCESS__FORK);
 }
 
 /*
@@ -3285,17 +3302,17 @@ static int selinux_task_setgid(gid_t id0, gid_t id1, gid_t id2, int flags)
 
 static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
 {
-	return task_has_perm(current, p, PROCESS__SETPGID);
+	return current_has_perm(p, PROCESS__SETPGID);
 }
 
 static int selinux_task_getpgid(struct task_struct *p)
 {
-	return task_has_perm(current, p, PROCESS__GETPGID);
+	return current_has_perm(p, PROCESS__GETPGID);
 }
 
 static int selinux_task_getsid(struct task_struct *p)
 {
-	return task_has_perm(current, p, PROCESS__GETSESSION);
+	return current_has_perm(p, PROCESS__GETSESSION);
 }
 
 static void selinux_task_getsecid(struct task_struct *p, u32 *secid)
@@ -3317,7 +3334,7 @@ static int selinux_task_setnice(struct task_struct *p, int nice)
 	if (rc)
 		return rc;
 
-	return task_has_perm(current, p, PROCESS__SETSCHED);
+	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
 static int selinux_task_setioprio(struct task_struct *p, int ioprio)
@@ -3328,12 +3345,12 @@ static int selinux_task_setioprio(struct task_struct *p, int ioprio)
 	if (rc)
 		return rc;
 
-	return task_has_perm(current, p, PROCESS__SETSCHED);
+	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
 static int selinux_task_getioprio(struct task_struct *p)
 {
-	return task_has_perm(current, p, PROCESS__GETSCHED);
+	return current_has_perm(p, PROCESS__GETSCHED);
 }
 
 static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim)
@@ -3350,7 +3367,7 @@ static int selinux_task_setrlimit(unsigned int resource, struct rlimit *new_rlim
 	   later be used as a safe reset point for the soft limit
 	   upon context transitions.  See selinux_bprm_committing_creds. */
 	if (old_rlim->rlim_max != new_rlim->rlim_max)
-		return task_has_perm(current, current, PROCESS__SETRLIMIT);
+		return current_has_perm(current, PROCESS__SETRLIMIT);
 
 	return 0;
 }
@@ -3363,17 +3380,17 @@ static int selinux_task_setscheduler(struct task_struct *p, int policy, struct s
 	if (rc)
 		return rc;
 
-	return task_has_perm(current, p, PROCESS__SETSCHED);
+	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
 static int selinux_task_getscheduler(struct task_struct *p)
 {
-	return task_has_perm(current, p, PROCESS__GETSCHED);
+	return current_has_perm(p, PROCESS__GETSCHED);
 }
 
 static int selinux_task_movememory(struct task_struct *p)
 {
-	return task_has_perm(current, p, PROCESS__SETSCHED);
+	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
 static int selinux_task_kill(struct task_struct *p, struct siginfo *info,
@@ -3394,7 +3411,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info,
 		rc = avc_has_perm(secid, task_sid(p),
 				  SECCLASS_PROCESS, perm, NULL);
 	else
-		rc = task_has_perm(current, p, perm);
+		rc = current_has_perm(p, perm);
 	return rc;
 }
 
@@ -5250,7 +5267,7 @@ static int selinux_getprocattr(struct task_struct *p,
 	unsigned len;
 
 	if (current != p) {
-		error = task_has_perm(current, p, PROCESS__GETATTR);
+		error = current_has_perm(p, PROCESS__GETATTR);
 		if (error)
 			return error;
 	}
@@ -5309,15 +5326,15 @@ static int selinux_setprocattr(struct task_struct *p,
 	 * above restriction is ever removed.
 	 */
 	if (!strcmp(name, "exec"))
-		error = task_has_perm(current, p, PROCESS__SETEXEC);
+		error = current_has_perm(p, PROCESS__SETEXEC);
 	else if (!strcmp(name, "fscreate"))
-		error = task_has_perm(current, p, PROCESS__SETFSCREATE);
+		error = current_has_perm(p, PROCESS__SETFSCREATE);
 	else if (!strcmp(name, "keycreate"))
-		error = task_has_perm(current, p, PROCESS__SETKEYCREATE);
+		error = current_has_perm(p, PROCESS__SETKEYCREATE);
 	else if (!strcmp(name, "sockcreate"))
-		error = task_has_perm(current, p, PROCESS__SETSOCKCREATE);
+		error = current_has_perm(p, PROCESS__SETSOCKCREATE);
 	else if (!strcmp(name, "current"))
-		error = task_has_perm(current, p, PROCESS__SETCURRENT);
+		error = current_has_perm(p, PROCESS__SETCURRENT);
 	else
 		error = -EINVAL;
 	if (error)
-- 
cgit v1.2.3-70-g09d2


From 3a3b7ce9336952ea7b9564d976d068a238976c9d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 14 Nov 2008 10:39:28 +1100
Subject: CRED: Allow kernel services to override LSM settings for task actions

Allow kernel services to override LSM settings appropriate to the actions
performed by a task by duplicating a set of credentials, modifying it and then
using task_struct::cred to point to it when performing operations on behalf of
a task.

This is used, for example, by CacheFiles which has to transparently access the
cache on behalf of a process that thinks it is doing, say, NFS accesses with a
potentially inappropriate (with respect to accessing the cache) set of
credentials.

This patch provides two LSM hooks for modifying a task security record:

 (*) security_kernel_act_as() which allows modification of the security datum
     with which a task acts on other objects (most notably files).

 (*) security_kernel_create_files_as() which allows modification of the
     security datum that is used to initialise the security data on a file that
     a task creates.

The patch also provides four new credentials handling functions, which wrap the
LSM functions:

 (1) prepare_kernel_cred()

     Prepare a set of credentials for a kernel service to use, based either on
     a daemon's credentials or on init_cred.  All the keyrings are cleared.

 (2) set_security_override()

     Set the LSM security ID in a set of credentials to a specific security
     context, assuming permission from the LSM policy.

 (3) set_security_override_from_ctx()

     As (2), but takes the security context as a string.

 (4) set_create_files_as()

     Set the file creation LSM security ID in a set of credentials to be the
     same as that on a particular inode.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com> [Smack changes]
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/cred.h       |   6 +++
 include/linux/security.h   |  28 +++++++++++
 kernel/cred.c              | 113 +++++++++++++++++++++++++++++++++++++++++++++
 security/capability.c      |  12 +++++
 security/security.c        |  10 ++++
 security/selinux/hooks.c   |  46 ++++++++++++++++++
 security/smack/smack_lsm.c |  37 +++++++++++++++
 7 files changed, 252 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 55a9c995d69..26c1ab17994 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -18,6 +18,7 @@
 
 struct user_struct;
 struct cred;
+struct inode;
 
 /*
  * COW Supplementary groups list
@@ -148,6 +149,11 @@ extern int commit_creds(struct cred *);
 extern void abort_creds(struct cred *);
 extern const struct cred *override_creds(const struct cred *);
 extern void revert_creds(const struct cred *);
+extern struct cred *prepare_kernel_cred(struct task_struct *);
+extern int change_create_files_as(struct cred *, struct inode *);
+extern int set_security_override(struct cred *, u32);
+extern int set_security_override_from_ctx(struct cred *, const char *);
+extern int set_create_files_as(struct cred *, struct inode *);
 extern void __init cred_init(void);
 
 /**
diff --git a/include/linux/security.h b/include/linux/security.h
index 56a0eed6567..59a11e19b61 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -587,6 +587,19 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@new points to the new credentials.
  *	@old points to the original credentials.
  *	Install a new set of credentials.
+ * @kernel_act_as:
+ *	Set the credentials for a kernel service to act as (subjective context).
+ *	@new points to the credentials to be modified.
+ *	@secid specifies the security ID to be set
+ *	The current task must be the one that nominated @secid.
+ *	Return 0 if successful.
+ * @kernel_create_files_as:
+ *	Set the file creation context in a set of credentials to be the same as
+ *	the objective context of the specified inode.
+ *	@new points to the credentials to be modified.
+ *	@inode points to the inode to use as a reference.
+ *	The current task must be the one that nominated @inode.
+ *	Return 0 if successful.
  * @task_setuid:
  *	Check permission before setting one or more of the user identity
  *	attributes of the current process.  The @flags parameter indicates
@@ -1381,6 +1394,8 @@ struct security_operations {
 	int (*cred_prepare)(struct cred *new, const struct cred *old,
 			    gfp_t gfp);
 	void (*cred_commit)(struct cred *new, const struct cred *old);
+	int (*kernel_act_as)(struct cred *new, u32 secid);
+	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
 	int (*task_setuid) (uid_t id0, uid_t id1, uid_t id2, int flags);
 	int (*task_fix_setuid) (struct cred *new, const struct cred *old,
 				int flags);
@@ -1632,6 +1647,8 @@ int security_task_create(unsigned long clone_flags);
 void security_cred_free(struct cred *cred);
 int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp);
 void security_commit_creds(struct cred *new, const struct cred *old);
+int security_kernel_act_as(struct cred *new, u32 secid);
+int security_kernel_create_files_as(struct cred *new, struct inode *inode);
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags);
 int security_task_fix_setuid(struct cred *new, const struct cred *old,
 			     int flags);
@@ -2151,6 +2168,17 @@ static inline void security_commit_creds(struct cred *new,
 {
 }
 
+static inline int security_kernel_act_as(struct cred *cred, u32 secid)
+{
+	return 0;
+}
+
+static inline int security_kernel_create_files_as(struct cred *cred,
+						  struct inode *inode)
+{
+	return 0;
+}
+
 static inline int security_task_setuid(uid_t id0, uid_t id1, uid_t id2,
 				       int flags)
 {
diff --git a/kernel/cred.c b/kernel/cred.c
index f3ca1066061..13697ca2bb3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -462,3 +462,116 @@ void __init cred_init(void)
 	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
 				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 }
+
+/**
+ * prepare_kernel_cred - Prepare a set of credentials for a kernel service
+ * @daemon: A userspace daemon to be used as a reference
+ *
+ * Prepare a set of credentials for a kernel service.  This can then be used to
+ * override a task's own credentials so that work can be done on behalf of that
+ * task that requires a different subjective context.
+ *
+ * @daemon is used to provide a base for the security record, but can be NULL.
+ * If @daemon is supplied, then the security data will be derived from that;
+ * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ *
+ * The caller may change these controls afterwards if desired.
+ *
+ * Returns the new credentials or NULL if out of memory.
+ *
+ * Does not take, and does not return holding current->cred_replace_mutex.
+ */
+struct cred *prepare_kernel_cred(struct task_struct *daemon)
+{
+	const struct cred *old;
+	struct cred *new;
+
+	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	if (daemon)
+		old = get_task_cred(daemon);
+	else
+		old = get_cred(&init_cred);
+
+	get_uid(new->user);
+	get_group_info(new->group_info);
+
+#ifdef CONFIG_KEYS
+	atomic_inc(&init_tgcred.usage);
+	new->tgcred = &init_tgcred;
+	new->request_key_auth = NULL;
+	new->thread_keyring = NULL;
+	new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+#endif
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL;
+#endif
+	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
+		goto error;
+
+	atomic_set(&new->usage, 1);
+	put_cred(old);
+	return new;
+
+error:
+	put_cred(new);
+	return NULL;
+}
+EXPORT_SYMBOL(prepare_kernel_cred);
+
+/**
+ * set_security_override - Set the security ID in a set of credentials
+ * @new: The credentials to alter
+ * @secid: The LSM security ID to set
+ *
+ * Set the LSM security ID in a set of credentials so that the subjective
+ * security is overridden when an alternative set of credentials is used.
+ */
+int set_security_override(struct cred *new, u32 secid)
+{
+	return security_kernel_act_as(new, secid);
+}
+EXPORT_SYMBOL(set_security_override);
+
+/**
+ * set_security_override_from_ctx - Set the security ID in a set of credentials
+ * @new: The credentials to alter
+ * @secctx: The LSM security context to generate the security ID from.
+ *
+ * Set the LSM security ID in a set of credentials so that the subjective
+ * security is overridden when an alternative set of credentials is used.  The
+ * security ID is specified in string form as a security context to be
+ * interpreted by the LSM.
+ */
+int set_security_override_from_ctx(struct cred *new, const char *secctx)
+{
+	u32 secid;
+	int ret;
+
+	ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
+	if (ret < 0)
+		return ret;
+
+	return set_security_override(new, secid);
+}
+EXPORT_SYMBOL(set_security_override_from_ctx);
+
+/**
+ * set_create_files_as - Set the LSM file create context in a set of credentials
+ * @new: The credentials to alter
+ * @inode: The inode to take the context from
+ *
+ * Change the LSM file creation context in a set of credentials to be the same
+ * as the object context of the specified inode, so that the new inodes have
+ * the same MAC context as that inode.
+ */
+int set_create_files_as(struct cred *new, struct inode *inode)
+{
+	new->fsuid = inode->i_uid;
+	new->fsgid = inode->i_gid;
+	return security_kernel_create_files_as(new, inode);
+}
+EXPORT_SYMBOL(set_create_files_as);
diff --git a/security/capability.c b/security/capability.c
index 185804f99ad..b9e391425e6 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -348,6 +348,16 @@ static void cap_cred_commit(struct cred *new, const struct cred *old)
 {
 }
 
+static int cap_kernel_act_as(struct cred *new, u32 secid)
+{
+	return 0;
+}
+
+static int cap_kernel_create_files_as(struct cred *new, struct inode *inode)
+{
+	return 0;
+}
+
 static int cap_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 {
 	return 0;
@@ -889,6 +899,8 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, cred_free);
 	set_to_cap_if_null(ops, cred_prepare);
 	set_to_cap_if_null(ops, cred_commit);
+	set_to_cap_if_null(ops, kernel_act_as);
+	set_to_cap_if_null(ops, kernel_create_files_as);
 	set_to_cap_if_null(ops, task_setuid);
 	set_to_cap_if_null(ops, task_fix_setuid);
 	set_to_cap_if_null(ops, task_setgid);
diff --git a/security/security.c b/security/security.c
index dc5babb2d6d..038ef04b2c7 100644
--- a/security/security.c
+++ b/security/security.c
@@ -616,6 +616,16 @@ void security_commit_creds(struct cred *new, const struct cred *old)
 	return security_ops->cred_commit(new, old);
 }
 
+int security_kernel_act_as(struct cred *new, u32 secid)
+{
+	return security_ops->kernel_act_as(new, secid);
+}
+
+int security_kernel_create_files_as(struct cred *new, struct inode *inode)
+{
+	return security_ops->kernel_create_files_as(new, inode);
+}
+
 int security_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 {
 	return security_ops->task_setuid(id0, id1, id2, flags);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 91b06f2aa96..520f82ab3fb 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3277,6 +3277,50 @@ static void selinux_cred_commit(struct cred *new, const struct cred *old)
 	secondary_ops->cred_commit(new, old);
 }
 
+/*
+ * set the security data for a kernel service
+ * - all the creation contexts are set to unlabelled
+ */
+static int selinux_kernel_act_as(struct cred *new, u32 secid)
+{
+	struct task_security_struct *tsec = new->security;
+	u32 sid = current_sid();
+	int ret;
+
+	ret = avc_has_perm(sid, secid,
+			   SECCLASS_KERNEL_SERVICE,
+			   KERNEL_SERVICE__USE_AS_OVERRIDE,
+			   NULL);
+	if (ret == 0) {
+		tsec->sid = secid;
+		tsec->create_sid = 0;
+		tsec->keycreate_sid = 0;
+		tsec->sockcreate_sid = 0;
+	}
+	return ret;
+}
+
+/*
+ * set the file creation context in a security record to the same as the
+ * objective context of the specified inode
+ */
+static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
+{
+	struct inode_security_struct *isec = inode->i_security;
+	struct task_security_struct *tsec = new->security;
+	u32 sid = current_sid();
+	int ret;
+
+	ret = avc_has_perm(sid, isec->sid,
+			   SECCLASS_KERNEL_SERVICE,
+			   KERNEL_SERVICE__CREATE_FILES_AS,
+			   NULL);
+
+	if (ret == 0)
+		tsec->create_sid = isec->sid;
+	return 0;
+}
+
 static int selinux_task_setuid(uid_t id0, uid_t id1, uid_t id2, int flags)
 {
 	/* Since setuid only affects the current process, and
@@ -5593,6 +5637,8 @@ static struct security_operations selinux_ops = {
 	.cred_free =			selinux_cred_free,
 	.cred_prepare =			selinux_cred_prepare,
 	.cred_commit =			selinux_cred_commit,
+	.kernel_act_as =		selinux_kernel_act_as,
+	.kernel_create_files_as =	selinux_kernel_create_files_as,
 	.task_setuid =			selinux_task_setuid,
 	.task_fix_setuid =		selinux_task_fix_setuid,
 	.task_setgid =			selinux_task_setgid,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index de396742abf..8ad48161cef 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1011,6 +1011,41 @@ static void smack_cred_commit(struct cred *new, const struct cred *old)
 {
 }
 
+/**
+ * smack_kernel_act_as - Set the subjective context in a set of credentials
+ * @new points to the set of credentials to be modified.
+ * @secid specifies the security ID to be set
+ *
+ * Set the security data for a kernel service.
+ */
+static int smack_kernel_act_as(struct cred *new, u32 secid)
+{
+	char *smack = smack_from_secid(secid);
+
+	if (smack == NULL)
+		return -EINVAL;
+
+	new->security = smack;
+	return 0;
+}
+
+/**
+ * smack_kernel_create_files_as - Set the file creation label in a set of creds
+ * @new points to the set of credentials to be modified
+ * @inode points to the inode to use as a reference
+ *
+ * Set the file creation context in a set of credentials to the same
+ * as the objective context of the specified inode
+ */
+static int smack_kernel_create_files_as(struct cred *new,
+					struct inode *inode)
+{
+	struct inode_smack *isp = inode->i_security;
+
+	new->security = isp->smk_inode;
+	return 0;
+}
+
 /**
  * smack_task_setpgid - Smack check on setting pgid
  * @p: the task object
@@ -2641,6 +2676,8 @@ struct security_operations smack_ops = {
 	.cred_free =			smack_cred_free,
 	.cred_prepare =			smack_cred_prepare,
 	.cred_commit =			smack_cred_commit,
+	.kernel_act_as =		smack_kernel_act_as,
+	.kernel_create_files_as =	smack_kernel_create_files_as,
 	.task_fix_setuid =		cap_task_fix_setuid,
 	.task_setpgid = 		smack_task_setpgid,
 	.task_getpgid = 		smack_task_getpgid,
-- 
cgit v1.2.3-70-g09d2


From 31e889098a80ceb3e9e3c555d522b2686a6663c6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 14 Nov 2008 16:21:19 -0800
Subject: ftrace: pass module struct to arch dynamic ftrace functions

Impact: allow archs more flexibility on dynamic ftrace implementations

Dynamic ftrace has largly been developed on x86. Since x86 does not
have the same limitations as other architectures, the ftrace interaction
between the generic code and the architecture specific code was not
flexible enough to handle some of the issues that other architectures
have.

Most notably, module trampolines. Due to the limited branch distance
that archs make in calling kernel core code from modules, the module
load code must create a trampoline to jump to what will make the
larger jump into core kernel code.

The problem arises when this happens to a call to mcount. Ftrace checks
all code before modifying it and makes sure the current code is what
it expects. Right now, there is not enough information to handle modifying
module trampolines.

This patch changes the API between generic dynamic ftrace code and
the arch dependent code. There is now two functions for modifying code:

  ftrace_make_nop(mod, rec, addr) - convert the code at rec->ip into
       a nop, where the original text is calling addr. (mod is the
       module struct if called by module init)

  ftrace_make_caller(rec, addr) - convert the code rec->ip that should
       be a nop into a caller to addr.

The record "rec" now has a new field called "arch" where the architecture
can add any special attributes to each call site record.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ftrace.h |  8 ++++++
 arch/x86/kernel/ftrace.c      | 29 +++++++++++++++++---
 include/linux/ftrace.h        | 53 +++++++++++++++++++++++++++---------
 kernel/module.c               |  2 +-
 kernel/trace/ftrace.c         | 62 +++++++++++++++++--------------------------
 5 files changed, 100 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9b6a1fa19e7..2bb43b433e0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -17,6 +17,14 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 	 */
 	return addr - 1;
 }
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+	/* No extra data needed for x86 */
+};
+
+#endif /*  CONFIG_DYNAMIC_FTRACE */
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index fe832738e1e..762222ad138 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -166,7 +166,7 @@ static int ftrace_calc_offset(long ip, long addr)
 	return (int)(addr - ip);
 }
 
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
 	static union ftrace_code_union calc;
 
@@ -311,12 +311,12 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
 
 static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
 
-unsigned char *ftrace_nop_replace(void)
+static unsigned char *ftrace_nop_replace(void)
 {
 	return ftrace_nop;
 }
 
-int
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 {
@@ -349,6 +349,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	return 0;
 }
 
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *new, *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_call_replace(ip, addr);
+	new = ftrace_nop_replace();
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned char *new, *old;
+	unsigned long ip = rec->ip;
+
+	old = ftrace_nop_replace();
+	new = ftrace_call_replace(ip, addr);
+
+	return ftrace_modify_code(rec->ip, old, new);
+}
+
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 	unsigned long ip = (unsigned long)(&ftrace_call);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4fbc4a8b86a..166a2070ef6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -74,6 +74,9 @@ static inline void ftrace_start(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
+/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
+#include <asm/ftrace.h>
+
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
 	FTRACE_FL_FAILED	= (1 << 1),
@@ -88,6 +91,7 @@ struct dyn_ftrace {
 	struct list_head	list;
 	unsigned long		ip; /* address of mcount call-site */
 	unsigned long		flags;
+	struct dyn_arch_ftrace	arch;
 };
 
 int ftrace_force_update(void);
@@ -95,22 +99,40 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset);
 
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
-extern unsigned char *ftrace_nop_replace(void);
-extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
 extern int ftrace_dyn_arch_init(void *data);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
 
-/* May be defined in arch */
-extern int ftrace_arch_read_dyn_info(char *buf, int size);
+/**
+ * ftrace_make_nop - convert code into top
+ * @mod: module structure if called by module load initialization
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should be calling
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch.  The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * The code segment at @rec->ip should be a caller to @addr
+ *
+ * Return must be:
+ *  0 on success
+ *  -EFAULT on error reading the location
+ *  -EINVAL on a failed compare of the contents
+ *  -EPERM  on error writing to the location
+ * Any other value will be considered a failure.
+ */
+extern int ftrace_make_nop(struct module *mod,
+			   struct dyn_ftrace *rec, unsigned long addr);
 
 /**
- * ftrace_modify_code - modify code segment
- * @ip: the address of the code segment
- * @old_code: the contents of what is expected to be there
- * @new_code: the code to patch in
+ * ftrace_make_call - convert a nop call site into a call to addr
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should call
  *
  * This is a very sensitive operation and great care needs
  * to be taken by the arch.  The operation should carefully
@@ -118,6 +140,8 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
  * what we expect it to be, and then on success of the compare,
  * it should write to the location.
  *
+ * The code segment at @rec->ip should be a nop
+ *
  * Return must be:
  *  0 on success
  *  -EFAULT on error reading the location
@@ -125,8 +149,11 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
  *  -EPERM  on error writing to the location
  * Any other value will be considered a failure.
  */
-extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-			      unsigned char *new_code);
+extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
+
+
+/* May be defined in arch */
+extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 
@@ -259,11 +286,13 @@ static inline void ftrace_dump(void) { }
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+extern void ftrace_init_module(struct module *mod,
+			       unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
 static inline void
-ftrace_init_module(unsigned long *start, unsigned long *end) { }
+ftrace_init_module(struct module *mod,
+		   unsigned long *start, unsigned long *end) { }
 #endif
 
 
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c2..69791274e89 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2201,7 +2201,7 @@ static noinline struct module *load_module(void __user *umod,
 	/* sechdrs[0].sh_size is always zero */
 	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
 			    sizeof(*mseg), &num_mcount);
-	ftrace_init_module(mseg, mseg + num_mcount);
+	ftrace_init_module(mod, mseg, mseg + num_mcount);
 
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3940c71ac2a..e9a5fbfce08 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -358,9 +358,7 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
 		printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
 }
 
-static void ftrace_bug(int failed, unsigned long ip,
-		       unsigned char *expected,
-		       unsigned char *replace)
+static void ftrace_bug(int failed, unsigned long ip)
 {
 	switch (failed) {
 	case -EFAULT:
@@ -372,9 +370,7 @@ static void ftrace_bug(int failed, unsigned long ip,
 		FTRACE_WARN_ON_ONCE(1);
 		pr_info("ftrace failed to modify ");
 		print_ip_sym(ip);
-		print_ip_ins(" expected: ", expected);
 		print_ip_ins(" actual: ", (unsigned char *)ip);
-		print_ip_ins(" replace: ", replace);
 		printk(KERN_CONT "\n");
 		break;
 	case -EPERM:
@@ -392,8 +388,7 @@ static void ftrace_bug(int failed, unsigned long ip,
 #define FTRACE_ADDR ((long)(ftrace_caller))
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec,
-		      unsigned char *old, unsigned char *new, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
 	unsigned long ip, fl;
 
@@ -435,12 +430,10 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		 * otherwise enable it!
 		 */
 		if (fl & FTRACE_FL_ENABLED) {
-			/* swap new and old */
-			new = old;
-			old = ftrace_call_replace(ip, FTRACE_ADDR);
+			enable = 0;
 			rec->flags &= ~FTRACE_FL_ENABLED;
 		} else {
-			new = ftrace_call_replace(ip, FTRACE_ADDR);
+			enable = 1;
 			rec->flags |= FTRACE_FL_ENABLED;
 		}
 	} else {
@@ -453,10 +446,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
 			if (fl == FTRACE_FL_NOTRACE)
 				return 0;
-
-			new = ftrace_call_replace(ip, FTRACE_ADDR);
-		} else
-			old = ftrace_call_replace(ip, FTRACE_ADDR);
+		}
 
 		if (enable) {
 			if (rec->flags & FTRACE_FL_ENABLED)
@@ -469,21 +459,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		}
 	}
 
-	return ftrace_modify_code(ip, old, new);
+	if (enable)
+		return ftrace_make_call(rec, FTRACE_ADDR);
+	else
+		return ftrace_make_nop(NULL, rec, FTRACE_ADDR);
 }
 
 static void ftrace_replace_code(int enable)
 {
 	int i, failed;
-	unsigned char *new = NULL, *old = NULL;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 
-	if (enable)
-		old = ftrace_nop_replace();
-	else
-		new = ftrace_nop_replace();
-
 	for (pg = ftrace_pages_start; pg; pg = pg->next) {
 		for (i = 0; i < pg->index; i++) {
 			rec = &pg->records[i];
@@ -504,34 +491,30 @@ static void ftrace_replace_code(int enable)
 				unfreeze_record(rec);
 			}
 
-			failed = __ftrace_replace_code(rec, old, new, enable);
+			failed = __ftrace_replace_code(rec, enable);
 			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
 				rec->flags |= FTRACE_FL_FAILED;
 				if ((system_state == SYSTEM_BOOTING) ||
 				    !core_kernel_text(rec->ip)) {
 					ftrace_free_rec(rec);
 				} else
-					ftrace_bug(failed, rec->ip, old, new);
+					ftrace_bug(failed, rec->ip);
 			}
 		}
 	}
 }
 
 static int
-ftrace_code_disable(struct dyn_ftrace *rec)
+ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 {
 	unsigned long ip;
-	unsigned char *nop, *call;
 	int ret;
 
 	ip = rec->ip;
 
-	nop = ftrace_nop_replace();
-	call = ftrace_call_replace(ip, mcount_addr);
-
-	ret = ftrace_modify_code(ip, call, nop);
+	ret = ftrace_make_nop(mod, rec, mcount_addr);
 	if (ret) {
-		ftrace_bug(ret, ip, call, nop);
+		ftrace_bug(ret, ip);
 		rec->flags |= FTRACE_FL_FAILED;
 		return 0;
 	}
@@ -650,7 +633,7 @@ static cycle_t		ftrace_update_time;
 static unsigned long	ftrace_update_cnt;
 unsigned long		ftrace_update_tot_cnt;
 
-static int ftrace_update_code(void)
+static int ftrace_update_code(struct module *mod)
 {
 	struct dyn_ftrace *p, *t;
 	cycle_t start, stop;
@@ -667,7 +650,7 @@ static int ftrace_update_code(void)
 		list_del_init(&p->list);
 
 		/* convert record (i.e, patch mcount-call with NOP) */
-		if (ftrace_code_disable(p)) {
+		if (ftrace_code_disable(mod, p)) {
 			p->flags |= FTRACE_FL_CONVERTED;
 			ftrace_update_cnt++;
 		} else
@@ -1309,7 +1292,8 @@ static __init int ftrace_init_debugfs(void)
 
 fs_initcall(ftrace_init_debugfs);
 
-static int ftrace_convert_nops(unsigned long *start,
+static int ftrace_convert_nops(struct module *mod,
+			       unsigned long *start,
 			       unsigned long *end)
 {
 	unsigned long *p;
@@ -1325,18 +1309,19 @@ static int ftrace_convert_nops(unsigned long *start,
 
 	/* disable interrupts to prevent kstop machine */
 	local_irq_save(flags);
-	ftrace_update_code();
+	ftrace_update_code(mod);
 	local_irq_restore(flags);
 	mutex_unlock(&ftrace_start_lock);
 
 	return 0;
 }
 
-void ftrace_init_module(unsigned long *start, unsigned long *end)
+void ftrace_init_module(struct module *mod,
+			unsigned long *start, unsigned long *end)
 {
 	if (ftrace_disabled || start == end)
 		return;
-	ftrace_convert_nops(start, end);
+	ftrace_convert_nops(mod, start, end);
 }
 
 extern unsigned long __start_mcount_loc[];
@@ -1366,7 +1351,8 @@ void __init ftrace_init(void)
 
 	last_ftrace_enabled = ftrace_enabled = 1;
 
-	ret = ftrace_convert_nops(__start_mcount_loc,
+	ret = ftrace_convert_nops(NULL,
+				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
 	return;
-- 
cgit v1.2.3-70-g09d2


From e7d3737ea1b102030f44e96c97754101e41515f0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 16 Nov 2008 06:02:06 +0100
Subject: tracing/function-return-tracer: support for dynamic ftrace on
 function return tracer

This patch adds the support for dynamic tracing on the function return tracer.
The whole difference with normal dynamic function tracing is that we don't need
to hook on a particular callback. The only pro that we want is to nop or set
dynamically the calls to ftrace_caller (which is ftrace_return_caller here).

Some security checks ensure that we are not trying to launch dynamic tracing for
return tracing while normal function tracing is already running.

An example of trace with getnstimeofday set as a filter:

ktime_get_ts+0x22/0x50 -> getnstimeofday (2283 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1396 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1825 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1426 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1524 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1382 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1434 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1464 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1502 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1404 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1397 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1051 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1314 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1344 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1163 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1390 ns)
ktime_get_ts+0x22/0x50 -> getnstimeofday (1374 ns)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S            |  18 ++-
 arch/x86/kernel/ftrace.c              | 258 +++++++++++++++++-----------------
 include/linux/ftrace.h                |  16 ++-
 kernel/trace/Kconfig                  |   1 -
 kernel/trace/ftrace.c                 |  58 +++++++-
 kernel/trace/trace_functions_return.c |  15 +-
 6 files changed, 211 insertions(+), 155 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f9762114983..74defe21ba4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1190,7 +1190,7 @@ ENTRY(mcount)
 	jnz trace
 #ifdef CONFIG_FUNCTION_RET_TRACER
 	cmpl $ftrace_stub, ftrace_function_return
-	jnz trace_return
+	jnz ftrace_return_caller
 #endif
 .globl ftrace_stub
 ftrace_stub:
@@ -1211,9 +1211,15 @@ trace:
 	popl %ecx
 	popl %eax
 	jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
-trace_return:
+ENTRY(ftrace_return_caller)
+	cmpl $0, function_trace_stop
+	jne ftrace_stub
+
 	pushl %eax
 	pushl %ecx
 	pushl %edx
@@ -1223,7 +1229,8 @@ trace_return:
 	popl %edx
 	popl %ecx
 	popl %eax
-	jmp ftrace_stub
+	ret
+END(ftrace_return_caller)
 
 .globl return_to_handler
 return_to_handler:
@@ -1237,10 +1244,7 @@ return_to_handler:
 	popl %ecx
 	popl %eax
 	ret
-#endif /* CONFIG_FUNCTION_RET_TRACER */
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
+#endif
 
 .section .rodata,"a"
 #include "syscall_table_32.S"
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d98b5a8ecf4..924153edd97 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,134 +24,6 @@
 #include <asm/nmi.h>
 
 
-
-#ifdef CONFIG_FUNCTION_RET_TRACER
-
-/*
- * These functions are picked from those used on
- * this page for dynamic ftrace. They have been
- * simplified to ignore all traces in NMI context.
- */
-static atomic_t in_nmi;
-
-void ftrace_nmi_enter(void)
-{
-	atomic_inc(&in_nmi);
-}
-
-void ftrace_nmi_exit(void)
-{
-	atomic_dec(&in_nmi);
-}
-
-/* Add a function return address to the trace stack on thread info.*/
-static int push_return_trace(unsigned long ret, unsigned long long time,
-				unsigned long func)
-{
-	int index;
-	struct thread_info *ti = current_thread_info();
-
-	/* The return trace stack is full */
-	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1)
-		return -EBUSY;
-
-	index = ++ti->curr_ret_stack;
-	barrier();
-	ti->ret_stack[index].ret = ret;
-	ti->ret_stack[index].func = func;
-	ti->ret_stack[index].calltime = time;
-
-	return 0;
-}
-
-/* Retrieve a function return address to the trace stack on thread info.*/
-static void pop_return_trace(unsigned long *ret, unsigned long long *time,
-				unsigned long *func)
-{
-	int index;
-
-	struct thread_info *ti = current_thread_info();
-	index = ti->curr_ret_stack;
-	*ret = ti->ret_stack[index].ret;
-	*func = ti->ret_stack[index].func;
-	*time = ti->ret_stack[index].calltime;
-	ti->curr_ret_stack--;
-}
-
-/*
- * Send the trace to the ring-buffer.
- * @return the original return address.
- */
-unsigned long ftrace_return_to_handler(void)
-{
-	struct ftrace_retfunc trace;
-	pop_return_trace(&trace.ret, &trace.calltime, &trace.func);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
-	ftrace_function_return(&trace);
-
-	return trace.ret;
-}
-
-/*
- * Hook the return address and push it in the stack of return addrs
- * in current thread info.
- */
-void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
-{
-	unsigned long old;
-	unsigned long long calltime;
-	int faulted;
-	unsigned long return_hooker = (unsigned long)
-				&return_to_handler;
-
-	/* Nmi's are currently unsupported */
-	if (atomic_read(&in_nmi))
-		return;
-
-	/*
-	 * Protect against fault, even if it shouldn't
-	 * happen. This tool is too much intrusive to
-	 * ignore such a protection.
-	 */
-	asm volatile(
-		"1: movl (%[parent_old]), %[old]\n"
-		"2: movl %[return_hooker], (%[parent_replaced])\n"
-		"   movl $0, %[faulted]\n"
-
-		".section .fixup, \"ax\"\n"
-		"3: movl $1, %[faulted]\n"
-		".previous\n"
-
-		".section __ex_table, \"a\"\n"
-		"   .long 1b, 3b\n"
-		"   .long 2b, 3b\n"
-		".previous\n"
-
-		: [parent_replaced] "=r" (parent), [old] "=r" (old),
-		  [faulted] "=r" (faulted)
-		: [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
-		: "memory"
-	);
-
-	if (WARN_ON(faulted)) {
-		unregister_ftrace_return();
-		return;
-	}
-
-	if (WARN_ON(!__kernel_text_address(old))) {
-		unregister_ftrace_return();
-		*parent = old;
-		return;
-	}
-
-	calltime = cpu_clock(raw_smp_processor_id());
-
-	if (push_return_trace(old, calltime, self_addr) == -EBUSY)
-		*parent = old;
-}
-
-#endif
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 union ftrace_code_union {
@@ -450,3 +322,133 @@ int __init ftrace_dyn_arch_init(void *data)
 	return 0;
 }
 #endif
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+
+#ifndef CONFIG_DYNAMIC_FTRACE
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+	atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+	atomic_dec(&in_nmi);
+}
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+				unsigned long func)
+{
+	int index;
+	struct thread_info *ti = current_thread_info();
+
+	/* The return trace stack is full */
+	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1)
+		return -EBUSY;
+
+	index = ++ti->curr_ret_stack;
+	barrier();
+	ti->ret_stack[index].ret = ret;
+	ti->ret_stack[index].func = func;
+	ti->ret_stack[index].calltime = time;
+
+	return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(unsigned long *ret, unsigned long long *time,
+				unsigned long *func)
+{
+	int index;
+
+	struct thread_info *ti = current_thread_info();
+	index = ti->curr_ret_stack;
+	*ret = ti->ret_stack[index].ret;
+	*func = ti->ret_stack[index].func;
+	*time = ti->ret_stack[index].calltime;
+	ti->curr_ret_stack--;
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+	struct ftrace_retfunc trace;
+	pop_return_trace(&trace.ret, &trace.calltime, &trace.func);
+	trace.rettime = cpu_clock(raw_smp_processor_id());
+	ftrace_function_return(&trace);
+
+	return trace.ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+	unsigned long old;
+	unsigned long long calltime;
+	int faulted;
+	unsigned long return_hooker = (unsigned long)
+				&return_to_handler;
+
+	/* Nmi's are currently unsupported */
+	if (atomic_read(&in_nmi))
+		return;
+
+	/*
+	 * Protect against fault, even if it shouldn't
+	 * happen. This tool is too much intrusive to
+	 * ignore such a protection.
+	 */
+	asm volatile(
+		"1: movl (%[parent_old]), %[old]\n"
+		"2: movl %[return_hooker], (%[parent_replaced])\n"
+		"   movl $0, %[faulted]\n"
+
+		".section .fixup, \"ax\"\n"
+		"3: movl $1, %[faulted]\n"
+		".previous\n"
+
+		".section __ex_table, \"a\"\n"
+		"   .long 1b, 3b\n"
+		"   .long 2b, 3b\n"
+		".previous\n"
+
+		: [parent_replaced] "=r" (parent), [old] "=r" (old),
+		  [faulted] "=r" (faulted)
+		: [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+		: "memory"
+	);
+
+	if (WARN_ON(faulted)) {
+		unregister_ftrace_return();
+		return;
+	}
+
+	if (WARN_ON(!__kernel_text_address(old))) {
+		unregister_ftrace_return();
+		*parent = old;
+		return;
+	}
+
+	calltime = cpu_clock(raw_smp_processor_id());
+
+	if (push_return_trace(old, calltime, self_addr) == -EBUSY)
+		*parent = old;
+}
+
+#endif /* CONFIG_FUNCTION_RET_TRACER */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 166a2070ef6..f1af1aab00e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -25,6 +25,17 @@ struct ftrace_ops {
 
 extern int function_trace_stop;
 
+/*
+ * Type of the current tracing.
+ */
+enum ftrace_tracing_type_t {
+	FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
+	FTRACE_TYPE_RETURN,	/* Hook the return of the function */
+};
+
+/* Current tracing type, default is FTRACE_TYPE_ENTER */
+extern enum ftrace_tracing_type_t ftrace_tracing_type;
+
 /**
  * ftrace_stop - stop function tracer.
  *
@@ -104,6 +115,9 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
+#ifdef CONFIG_FUNCTION_RET_TRACER
+extern void ftrace_return_caller(void);
+#endif
 
 /**
  * ftrace_make_nop - convert code into top
@@ -310,7 +324,7 @@ struct ftrace_retfunc {
 /* Type of a callback handler of tracing return function */
 typedef void (*trace_function_return_t)(struct ftrace_retfunc *);
 
-extern void register_ftrace_return(trace_function_return_t func);
+extern int register_ftrace_return(trace_function_return_t func);
 /* The current handler in use */
 extern trace_function_return_t ftrace_function_return;
 extern void unregister_ftrace_return(void);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9c89526b6b7..b8378fad29a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -59,7 +59,6 @@ config FUNCTION_TRACER
 
 config FUNCTION_RET_TRACER
 	bool "Kernel Function return Tracer"
-	depends on !DYNAMIC_FTRACE
 	depends on HAVE_FUNCTION_RET_TRACER
 	depends on FUNCTION_TRACER
 	help
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b42ec1de546..2f78a45aac1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -50,6 +50,9 @@ static int last_ftrace_enabled;
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
+/* By default, current tracing type is normal tracing. */
+enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER;
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -385,12 +388,21 @@ static void ftrace_bug(int failed, unsigned long ip)
 	}
 }
 
-#define FTRACE_ADDR ((long)(ftrace_caller))
 
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
 	unsigned long ip, fl;
+	unsigned long ftrace_addr;
+
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
+		ftrace_addr = (unsigned long)ftrace_caller;
+	else
+		ftrace_addr = (unsigned long)ftrace_return_caller;
+#else
+	ftrace_addr = (unsigned long)ftrace_caller;
+#endif
 
 	ip = rec->ip;
 
@@ -450,9 +462,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	}
 
 	if (rec->flags & FTRACE_FL_ENABLED)
-		return ftrace_make_call(rec, FTRACE_ADDR);
+		return ftrace_make_call(rec, ftrace_addr);
 	else
-		return ftrace_make_nop(NULL, rec, FTRACE_ADDR);
+		return ftrace_make_nop(NULL, rec, ftrace_addr);
 }
 
 static void ftrace_replace_code(int enable)
@@ -1405,10 +1417,17 @@ int register_ftrace_function(struct ftrace_ops *ops)
 		return -1;
 
 	mutex_lock(&ftrace_sysctl_lock);
+
+	if (ftrace_tracing_type == FTRACE_TYPE_RETURN) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	ret = __register_ftrace_function(ops);
 	ftrace_startup();
-	mutex_unlock(&ftrace_sysctl_lock);
 
+out:
+	mutex_unlock(&ftrace_sysctl_lock);
 	return ret;
 }
 
@@ -1474,16 +1493,45 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
+
+/* The callback that hooks the return of a function */
 trace_function_return_t ftrace_function_return =
 			(trace_function_return_t)ftrace_stub;
-void register_ftrace_return(trace_function_return_t func)
+
+int register_ftrace_return(trace_function_return_t func)
 {
+	int ret = 0;
+
+	mutex_lock(&ftrace_sysctl_lock);
+
+	/*
+	 * Don't launch return tracing if normal function
+	 * tracing is already running.
+	 */
+	if (ftrace_trace_function != ftrace_stub) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ftrace_tracing_type = FTRACE_TYPE_RETURN;
 	ftrace_function_return = func;
+	ftrace_startup();
+
+out:
+	mutex_unlock(&ftrace_sysctl_lock);
+	return ret;
 }
 
 void unregister_ftrace_return(void)
 {
+	mutex_lock(&ftrace_sysctl_lock);
+
 	ftrace_function_return = (trace_function_return_t)ftrace_stub;
+	ftrace_shutdown();
+	/* Restore normal tracing type */
+	ftrace_tracing_type = FTRACE_TYPE_ENTER;
+
+	mutex_unlock(&ftrace_sysctl_lock);
 }
 #endif
 
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
index 61185f756a1..a68564af022 100644
--- a/kernel/trace/trace_functions_return.c
+++ b/kernel/trace/trace_functions_return.c
@@ -14,29 +14,18 @@
 #include "trace.h"
 
 
-static void start_return_trace(struct trace_array *tr)
-{
-	register_ftrace_return(&trace_function_return);
-}
-
-static void stop_return_trace(struct trace_array *tr)
-{
-	unregister_ftrace_return();
-}
-
 static int return_trace_init(struct trace_array *tr)
 {
 	int cpu;
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
-	start_return_trace(tr);
-	return 0;
+	return register_ftrace_return(&trace_function_return);
 }
 
 static void return_trace_reset(struct trace_array *tr)
 {
-		stop_return_trace(tr);
+		unregister_ftrace_return();
 }
 
 
-- 
cgit v1.2.3-70-g09d2


From 954e100d2275cb2f150f2b18d5cddcdf67b956ac Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:34 -0500
Subject: rcu: add rcu_read_*_sched_notrace()

Impact: new API, useful for tracepoints and markers.

Add _notrace version to rcu_read_*_sched().

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Reviewed-by: Paul E McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupdate.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 86f1f5e43e3..895dc9c1088 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -142,6 +142,7 @@ struct rcu_head {
  * on the write-side to insure proper synchronization.
  */
 #define rcu_read_lock_sched() preempt_disable()
+#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
 
 /*
  * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
@@ -149,6 +150,7 @@ struct rcu_head {
  * See rcu_read_lock_sched for more information.
  */
 #define rcu_read_unlock_sched() preempt_enable()
+#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
 
 
-- 
cgit v1.2.3-70-g09d2


From e3f8c4b9117d70127a8cab480af83bbfd048a28b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 Nov 2008 17:47:36 -0500
Subject: markers: add missing stdargs.h include, needed due to va_list usage

Impact: build fix (for future changes)

That seemed to cause built issue when marker.h is included early, even
though stdargs.h is included in kernel.h.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 4cf45472d9f..05ec0df3708 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -12,6 +12,7 @@
  * See the file COPYING for more details.
  */
 
+#include <stdarg.h>
 #include <linux/types.h>
 
 struct module;
-- 
cgit v1.2.3-70-g09d2


From c1df1bd2c4d4b20c83755a0f41956b57aec4842a Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:39 -0500
Subject: markers: auto enable tracepoints (new API : trace_mark_tp())

Impact: new API

Add a new API trace_mark_tp(), which declares a marker within a
tracepoint probe. When the marker is activated, the tracepoint is
automatically enabled.

No branch test is used at the marker site, because it would be a
duplicate of the branch already present in the tracepoint.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/marker.h | 45 +++++++++++++++++++++++++++++++++++++++++-
 init/Kconfig           |  1 +
 kernel/marker.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 96 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/marker.h b/include/linux/marker.h
index 05ec0df3708..57a307018ce 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -49,6 +49,8 @@ struct marker {
 	void (*call)(const struct marker *mdata, void *call_private, ...);
 	struct marker_probe_closure single;
 	struct marker_probe_closure *multi;
+	const char *tp_name;	/* Optional tracepoint name */
+	void *tp_cb;		/* Optional tracepoint callback */
 } __attribute__((aligned(8)));
 
 #ifdef CONFIG_MARKERS
@@ -73,7 +75,7 @@ struct marker {
 		__attribute__((section("__markers"), aligned(8))) =	\
 		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
 		0, 0, marker_probe_cb,					\
-		{ __mark_empty_function, NULL}, NULL };			\
+		{ __mark_empty_function, NULL}, NULL, NULL, NULL };	\
 		__mark_check_format(format, ## args);			\
 		if (unlikely(__mark_##name.state)) {			\
 			(*__mark_##name.call)				\
@@ -81,11 +83,38 @@ struct marker {
 		}							\
 	} while (0)
 
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+	do {								\
+		void __check_tp_type(void)				\
+		{							\
+			register_trace_##tp_name(tp_cb);		\
+		}							\
+		static const char __mstrtab_##name[]			\
+		__attribute__((section("__markers_strings")))		\
+		= #name "\0" format;					\
+		static struct marker __mark_##name			\
+		__attribute__((section("__markers"), aligned(8))) =	\
+		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
+		0, 0, marker_probe_cb,					\
+		{ __mark_empty_function, NULL}, NULL, #tp_name, tp_cb };\
+		__mark_check_format(format, ## args);			\
+		(*__mark_##name.call)(&__mark_##name, call_private,	\
+					## args);			\
+	} while (0)
+
 extern void marker_update_probe_range(struct marker *begin,
 	struct marker *end);
 #else /* !CONFIG_MARKERS */
 #define __trace_mark(generic, name, call_private, format, args...) \
 		__mark_check_format(format, ## args)
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+	do {								\
+		void __check_tp_type(void)				\
+		{							\
+			register_trace_##tp_name(tp_cb);		\
+		}							\
+		__mark_check_format(format, ## args);			\
+	} while (0)
 static inline void marker_update_probe_range(struct marker *begin,
 	struct marker *end)
 { }
@@ -117,6 +146,20 @@ static inline void marker_update_probe_range(struct marker *begin,
 #define _trace_mark(name, format, args...) \
 	__trace_mark(1, name, NULL, format, ## args)
 
+/**
+ * trace_mark_tp - Marker in a tracepoint callback
+ * @name: marker name, not quoted.
+ * @tp_name: tracepoint name, not quoted.
+ * @tp_cb: tracepoint callback. Should have an associated global symbol so it
+ *         is not optimized away by the compiler (should not be static).
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker in a tracepoint callback.
+ */
+#define trace_mark_tp(name, tp_name, tp_cb, format, args...)	\
+	__trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
+
 /**
  * MARK_NOARGS - Format string for a marker with no argument.
  */
diff --git a/init/Kconfig b/init/Kconfig
index 86b00c53fad..f5bacb43871 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -808,6 +808,7 @@ config TRACEPOINTS
 
 config MARKERS
 	bool "Activate markers"
+	depends on TRACEPOINTS
 	help
 	  Place an empty function call at each marker site. Can be
 	  dynamically changed for a probe function.
diff --git a/kernel/marker.c b/kernel/marker.c
index 348e70cc355..c14ec26a9b9 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -479,7 +479,7 @@ static int marker_set_format(struct marker_entry *entry, const char *format)
 static int set_marker(struct marker_entry *entry, struct marker *elem,
 		int active)
 {
-	int ret;
+	int ret = 0;
 	WARN_ON(strcmp(entry->name, elem->name) != 0);
 
 	if (entry->format) {
@@ -531,9 +531,40 @@ static int set_marker(struct marker_entry *entry, struct marker *elem,
 	 */
 	smp_wmb();
 	elem->ptype = entry->ptype;
+
+	if (elem->tp_name && (active ^ elem->state)) {
+		WARN_ON(!elem->tp_cb);
+		/*
+		 * It is ok to directly call the probe registration because type
+		 * checking has been done in the __trace_mark_tp() macro.
+		 */
+
+		if (active) {
+			/*
+			 * try_module_get should always succeed because we hold
+			 * lock_module() to get the tp_cb address.
+			 */
+			ret = try_module_get(__module_text_address(
+				(unsigned long)elem->tp_cb));
+			BUG_ON(!ret);
+			ret = tracepoint_probe_register_noupdate(
+				elem->tp_name,
+				elem->tp_cb);
+		} else {
+			ret = tracepoint_probe_unregister_noupdate(
+				elem->tp_name,
+				elem->tp_cb);
+			/*
+			 * tracepoint_probe_update_all() must be called
+			 * before the module containing tp_cb is unloaded.
+			 */
+			module_put(__module_text_address(
+				(unsigned long)elem->tp_cb));
+		}
+	}
 	elem->state = active;
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -544,7 +575,24 @@ static int set_marker(struct marker_entry *entry, struct marker *elem,
  */
 static void disable_marker(struct marker *elem)
 {
+	int ret;
+
 	/* leave "call" as is. It is known statically. */
+	if (elem->tp_name && elem->state) {
+		WARN_ON(!elem->tp_cb);
+		/*
+		 * It is ok to directly call the probe registration because type
+		 * checking has been done in the __trace_mark_tp() macro.
+		 */
+		ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+			elem->tp_cb);
+		WARN_ON(ret);
+		/*
+		 * tracepoint_probe_update_all() must be called
+		 * before the module containing tp_cb is unloaded.
+		 */
+		module_put(__module_text_address((unsigned long)elem->tp_cb));
+	}
 	elem->state = 0;
 	elem->single.func = __mark_empty_function;
 	/* Update the function before setting the ptype */
@@ -608,6 +656,7 @@ static void marker_update_probes(void)
 	marker_update_probe_range(__start___markers, __stop___markers);
 	/* Markers in modules. */
 	module_update_markers();
+	tracepoint_probe_update_all();
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From a0bca6a59ebc052751eed6e3b182c153495672d8 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:40 -0500
Subject: markers: create DEFINE_MARKER and GET_MARKER (new API)

Impact: new API.

Allow markers to be used only for declaration, without function call
associated. Useful to create specialized probes.

The problem we had is that two function calls were required when one
wanted to put a marker in a tracepoint probe. Now the marker can be used
simply for trace data type declaration, leaving the trace write work
within the tracepoint probe without any additional function call.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/markers.txt | 14 ++++++++++++++
 include/linux/marker.h    | 39 +++++++++++++++++++++++----------------
 2 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/markers.txt b/Documentation/markers.txt
index 089f6138fcd..6d275e4ef38 100644
--- a/Documentation/markers.txt
+++ b/Documentation/markers.txt
@@ -70,6 +70,20 @@ a printk warning which identifies the inconsistency:
 
 "Format mismatch for probe probe_name (format), marker (format)"
 
+Another way to use markers is to simply define the marker without generating any
+function call to actually call into the marker. This is useful in combination
+with tracepoint probes in a scheme like this :
+
+void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
+
+DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
+	"arg1 %u pid %d");
+
+notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
+{
+	struct marker *marker = &GET_MARKER(kernel_irq_entry);
+	/* write data to trace buffers ... */
+}
 
 * Probe / marker example
 
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 57a307018ce..34c14bc957f 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -55,6 +55,22 @@ struct marker {
 
 #ifdef CONFIG_MARKERS
 
+#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format)		\
+		static const char __mstrtab_##name[]			\
+		__attribute__((section("__markers_strings")))		\
+		= #name "\0" format;					\
+		static struct marker __mark_##name			\
+		__attribute__((section("__markers"), aligned(8))) =	\
+		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
+		  0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
+		  NULL, tp_name_str, tp_cb }
+
+#define DEFINE_MARKER(name, format)					\
+		_DEFINE_MARKER(name, NULL, NULL, format)
+
+#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format)			\
+		_DEFINE_MARKER(name, #tp_name, tp_cb, format)
+
 /*
  * Note : the empty asm volatile with read constraint is used here instead of a
  * "used" attribute to fix a gcc 4.1.x bug.
@@ -68,14 +84,7 @@ struct marker {
  */
 #define __trace_mark(generic, name, call_private, format, args...)	\
 	do {								\
-		static const char __mstrtab_##name[]			\
-		__attribute__((section("__markers_strings")))		\
-		= #name "\0" format;					\
-		static struct marker __mark_##name			\
-		__attribute__((section("__markers"), aligned(8))) =	\
-		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
-		0, 0, marker_probe_cb,					\
-		{ __mark_empty_function, NULL}, NULL, NULL, NULL };	\
+		DEFINE_MARKER(name, format);				\
 		__mark_check_format(format, ## args);			\
 		if (unlikely(__mark_##name.state)) {			\
 			(*__mark_##name.call)				\
@@ -89,14 +98,7 @@ struct marker {
 		{							\
 			register_trace_##tp_name(tp_cb);		\
 		}							\
-		static const char __mstrtab_##name[]			\
-		__attribute__((section("__markers_strings")))		\
-		= #name "\0" format;					\
-		static struct marker __mark_##name			\
-		__attribute__((section("__markers"), aligned(8))) =	\
-		{ __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],	\
-		0, 0, marker_probe_cb,					\
-		{ __mark_empty_function, NULL}, NULL, #tp_name, tp_cb };\
+		DEFINE_MARKER_TP(name, tp_name, tp_cb, format);		\
 		__mark_check_format(format, ## args);			\
 		(*__mark_##name.call)(&__mark_##name, call_private,	\
 					## args);			\
@@ -104,7 +106,11 @@ struct marker {
 
 extern void marker_update_probe_range(struct marker *begin,
 	struct marker *end);
+
+#define GET_MARKER(name)	(__mark_##name)
+
 #else /* !CONFIG_MARKERS */
+#define DEFINE_MARKER(name, tp_name, tp_cb, format)
 #define __trace_mark(generic, name, call_private, format, args...) \
 		__mark_check_format(format, ## args)
 #define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
@@ -118,6 +124,7 @@ extern void marker_update_probe_range(struct marker *begin,
 static inline void marker_update_probe_range(struct marker *begin,
 	struct marker *end)
 { }
+#define GET_MARKER(name)
 #endif /* CONFIG_MARKERS */
 
 /**
-- 
cgit v1.2.3-70-g09d2


From da7b3eab167091693ad215ad7692f7d0d24d1356 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:43 -0500
Subject: tracepoints: use rcu_*_sched_notrace

Make sure tracepoints can be called within ftrace callbacks.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 63064e9403f..69648c54a32 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -40,14 +40,14 @@ struct tracepoint {
 	do {								\
 		void **it_func;						\
 									\
-		rcu_read_lock_sched();					\
+		rcu_read_lock_sched_notrace();				\
 		it_func = rcu_dereference((tp)->funcs);			\
 		if (it_func) {						\
 			do {						\
 				((void(*)(proto))(*it_func))(args);	\
 			} while (*(++it_func));				\
 		}							\
-		rcu_read_unlock_sched();				\
+		rcu_read_unlock_sched_notrace();			\
 	} while (0)
 
 /*
-- 
cgit v1.2.3-70-g09d2


From c420970ef476d7d68df119711700666224001f43 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:44 -0500
Subject: tracepoints: use unregister return value

Impact: bugfix.

Unregistering a tracepoint can fail. Return the error value.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 69648c54a32..c60a791f887 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -73,9 +73,9 @@ struct tracepoint {
 		return tracepoint_probe_register(#name ":" #proto,	\
 			(void *)probe);					\
 	}								\
-	static inline void unregister_trace_##name(void (*probe)(proto))\
+	static inline int unregister_trace_##name(void (*probe)(proto))	\
 	{								\
-		tracepoint_probe_unregister(#name ":" #proto,		\
+		return tracepoint_probe_unregister(#name ":" #proto,	\
 			(void *)probe);					\
 	}
 
@@ -92,8 +92,10 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
 	{								\
 		return -ENOSYS;						\
 	}								\
-	static inline void unregister_trace_##name(void (*probe)(proto))\
-	{ }
+	static inline int unregister_trace_##name(void (*probe)(proto))	\
+	{								\
+		return -ENOSYS;						\
+	}
 
 static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end)
-- 
cgit v1.2.3-70-g09d2


From 5f382671def7cb9c0f4b75d586dc5f60dca5e1c3 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:45 -0500
Subject: tracepoints: do not put arguments in name

Impact: cleanup

That's overkill, takes space. We have a global tracepoint registery in
header files anyway.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tracepoint.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index c60a791f887..7e9b42aeae0 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -60,7 +60,7 @@ struct tracepoint {
 	{								\
 		static const char __tpstrtab_##name[]			\
 		__attribute__((section("__tracepoints_strings")))	\
-		= #name ":" #proto;					\
+		= #name;						\
 		static struct tracepoint __tracepoint_##name		\
 		__attribute__((section("__tracepoints"), aligned(8))) =	\
 		{ __tpstrtab_##name, 0, NULL };				\
@@ -70,13 +70,11 @@ struct tracepoint {
 	}								\
 	static inline int register_trace_##name(void (*probe)(proto))	\
 	{								\
-		return tracepoint_probe_register(#name ":" #proto,	\
-			(void *)probe);					\
+		return tracepoint_probe_register(#name, (void *)probe);	\
 	}								\
 	static inline int unregister_trace_##name(void (*probe)(proto))	\
 	{								\
-		return tracepoint_probe_unregister(#name ":" #proto,	\
-			(void *)probe);					\
+		return tracepoint_probe_unregister(#name, (void *)probe);\
 	}
 
 extern void tracepoint_update_probe_range(struct tracepoint *begin,
-- 
cgit v1.2.3-70-g09d2


From 7e066fb870fcd1025ec3ba7bbde5d541094f4ce1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 14 Nov 2008 17:47:47 -0500
Subject: tracepoints: add DECLARE_TRACE() and DEFINE_TRACE()

Impact: API *CHANGE*. Must update all tracepoint users.

Add DEFINE_TRACE() to tracepoints to let them declare the tracepoint
structure in a single spot for all the kernel. It helps reducing memory
consumption, especially when declaring a lot of tracepoints, e.g. for
kmalloc tracing.

*API CHANGE WARNING*: now, DECLARE_TRACE() must be used in headers for
tracepoint declarations rather than DEFINE_TRACE(). This is the sane way
to do it. The name previously used was misleading.

Updates scheduler instrumentation to follow this API change.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/tracepoints.txt           |  7 ++++++-
 include/asm-generic/vmlinux.lds.h       |  1 +
 include/linux/tracepoint.h              | 35 +++++++++++++++++++++++----------
 include/trace/sched.h                   | 24 +++++++++++-----------
 kernel/exit.c                           |  4 ++++
 kernel/fork.c                           |  2 ++
 kernel/kthread.c                        |  3 +++
 kernel/sched.c                          |  6 ++++++
 kernel/signal.c                         |  2 ++
 samples/tracepoints/tp-samples-trace.h  |  4 ++--
 samples/tracepoints/tracepoint-sample.c |  3 +++
 11 files changed, 66 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
index 5d354e16749..e8ad47b437f 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/tracepoints.txt
@@ -42,7 +42,7 @@ In include/trace/subsys.h :
 
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(subsys_eventname,
+DECLARE_TRACE(subsys_eventname,
 	TPPTOTO(int firstarg, struct task_struct *p),
 	TPARGS(firstarg, p));
 
@@ -50,6 +50,8 @@ In subsys/file.c (where the tracing statement must be added) :
 
 #include <trace/subsys.h>
 
+DEFINE_TRACE(subsys_eventname);
+
 void somefct(void)
 {
 	...
@@ -86,6 +88,9 @@ to limit collisions. Tracepoint names are global to the kernel: they are
 considered as being the same whether they are in the core kernel image or in
 modules.
 
+If the tracepoint has to be used in kernel modules, an
+EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be used to
+export the defined tracepoints.
 
 * Probe / tracepoint example
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index a5e4ed9baec..3b46ae46493 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -71,6 +71,7 @@
 	VMLINUX_SYMBOL(__start___markers) = .;				\
 	*(__markers)							\
 	VMLINUX_SYMBOL(__stop___markers) = .;				\
+	. = ALIGN(32);							\
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
 	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 7e9b42aeae0..75700545836 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -24,8 +24,12 @@ struct tracepoint {
 	const char *name;		/* Tracepoint name */
 	int state;			/* State. */
 	void **funcs;
-} __attribute__((aligned(8)));
-
+} __attribute__((aligned(32)));		/*
+					 * Aligned on 32 bytes because it is
+					 * globally visible and gcc happily
+					 * align these on the structure size.
+					 * Keep in sync with vmlinux.lds.h.
+					 */
 
 #define TPPROTO(args...)	args
 #define TPARGS(args...)		args
@@ -55,15 +59,10 @@ struct tracepoint {
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  */
-#define DEFINE_TRACE(name, proto, args)					\
+#define DECLARE_TRACE(name, proto, args)				\
+	extern struct tracepoint __tracepoint_##name;			\
 	static inline void trace_##name(proto)				\
 	{								\
-		static const char __tpstrtab_##name[]			\
-		__attribute__((section("__tracepoints_strings")))	\
-		= #name;						\
-		static struct tracepoint __tracepoint_##name		\
-		__attribute__((section("__tracepoints"), aligned(8))) =	\
-		{ __tpstrtab_##name, 0, NULL };				\
 		if (unlikely(__tracepoint_##name.state))		\
 			__DO_TRACE(&__tracepoint_##name,		\
 				TPPROTO(proto), TPARGS(args));		\
@@ -77,11 +76,23 @@ struct tracepoint {
 		return tracepoint_probe_unregister(#name, (void *)probe);\
 	}
 
+#define DEFINE_TRACE(name)						\
+	static const char __tpstrtab_##name[]				\
+	__attribute__((section("__tracepoints_strings"))) = #name;	\
+	struct tracepoint __tracepoint_##name				\
+	__attribute__((section("__tracepoints"), aligned(32))) =	\
+		{ __tpstrtab_##name, 0, NULL }
+
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)				\
+	EXPORT_SYMBOL_GPL(__tracepoint_##name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)					\
+	EXPORT_SYMBOL(__tracepoint_##name)
+
 extern void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end);
 
 #else /* !CONFIG_TRACEPOINTS */
-#define DEFINE_TRACE(name, proto, args)			\
+#define DECLARE_TRACE(name, proto, args)				\
 	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
 	{ }								\
 	static inline void trace_##name(proto)				\
@@ -95,6 +106,10 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
 		return -ENOSYS;						\
 	}
 
+#define DEFINE_TRACE(name)
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)
+
 static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end)
 { }
diff --git a/include/trace/sched.h b/include/trace/sched.h
index ad47369d01b..9b2854abf7e 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -4,52 +4,52 @@
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(sched_kthread_stop,
+DECLARE_TRACE(sched_kthread_stop,
 	TPPROTO(struct task_struct *t),
 		TPARGS(t));
 
-DEFINE_TRACE(sched_kthread_stop_ret,
+DECLARE_TRACE(sched_kthread_stop_ret,
 	TPPROTO(int ret),
 		TPARGS(ret));
 
-DEFINE_TRACE(sched_wait_task,
+DECLARE_TRACE(sched_wait_task,
 	TPPROTO(struct rq *rq, struct task_struct *p),
 		TPARGS(rq, p));
 
-DEFINE_TRACE(sched_wakeup,
+DECLARE_TRACE(sched_wakeup,
 	TPPROTO(struct rq *rq, struct task_struct *p),
 		TPARGS(rq, p));
 
-DEFINE_TRACE(sched_wakeup_new,
+DECLARE_TRACE(sched_wakeup_new,
 	TPPROTO(struct rq *rq, struct task_struct *p),
 		TPARGS(rq, p));
 
-DEFINE_TRACE(sched_switch,
+DECLARE_TRACE(sched_switch,
 	TPPROTO(struct rq *rq, struct task_struct *prev,
 		struct task_struct *next),
 		TPARGS(rq, prev, next));
 
-DEFINE_TRACE(sched_migrate_task,
+DECLARE_TRACE(sched_migrate_task,
 	TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu),
 		TPARGS(rq, p, dest_cpu));
 
-DEFINE_TRACE(sched_process_free,
+DECLARE_TRACE(sched_process_free,
 	TPPROTO(struct task_struct *p),
 		TPARGS(p));
 
-DEFINE_TRACE(sched_process_exit,
+DECLARE_TRACE(sched_process_exit,
 	TPPROTO(struct task_struct *p),
 		TPARGS(p));
 
-DEFINE_TRACE(sched_process_wait,
+DECLARE_TRACE(sched_process_wait,
 	TPPROTO(struct pid *pid),
 		TPARGS(pid));
 
-DEFINE_TRACE(sched_process_fork,
+DECLARE_TRACE(sched_process_fork,
 	TPPROTO(struct task_struct *parent, struct task_struct *child),
 		TPARGS(parent, child));
 
-DEFINE_TRACE(sched_signal_send,
+DECLARE_TRACE(sched_signal_send,
 	TPPROTO(int sig, struct task_struct *p),
 		TPARGS(sig, p));
 
diff --git a/kernel/exit.c b/kernel/exit.c
index ae2b92be5fa..f995d241866 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,10 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+DEFINE_TRACE(sched_process_free);
+DEFINE_TRACE(sched_process_exit);
+DEFINE_TRACE(sched_process_wait);
+
 static void exit_mm(struct task_struct * tsk);
 
 static inline int task_detached(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe..0837d0deee5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -79,6 +79,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+DEFINE_TRACE(sched_process_fork);
+
 int nr_processes(void)
 {
 	int cpu;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0..4fbc456f393 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+DEFINE_TRACE(sched_kthread_stop);
+DEFINE_TRACE(sched_kthread_stop_ret);
+
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 50a21f96467..327f91c63c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc65445..e9afe63da24 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
+DEFINE_TRACE(sched_signal_send);
+
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index 0216b55bd64..01724e04c55 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -4,10 +4,10 @@
 #include <linux/proc_fs.h>	/* for struct inode and struct file */
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(subsys_event,
+DECLARE_TRACE(subsys_event,
 	TPPROTO(struct inode *inode, struct file *file),
 	TPARGS(inode, file));
-DEFINE_TRACE(subsys_eventb,
+DECLARE_TRACE(subsys_eventb,
 	TPPROTO(void),
 	TPARGS());
 #endif
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
index 4ae4b7fcc04..00d169792a3 100644
--- a/samples/tracepoints/tracepoint-sample.c
+++ b/samples/tracepoints/tracepoint-sample.c
@@ -13,6 +13,9 @@
 #include <linux/proc_fs.h>
 #include "tp-samples-trace.h"
 
+DEFINE_TRACE(subsys_event);
+DEFINE_TRACE(subsys_eventb);
+
 struct proc_dir_entry *pentry_example;
 
 static int my_open(struct inode *inode, struct file *file)
-- 
cgit v1.2.3-70-g09d2


From f004f3ea34209d8b836426b26ade3dc502631b18 Mon Sep 17 00:00:00 2001
From: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
Date: Fri, 14 Nov 2008 00:24:34 +0000
Subject: phylib: make mdio-gpio work without OF (v4)

make mdio-gpio work with non OpenFirmware gpio implementation.

Aditional changes to mdio-gpio:
- use gpio_request() and gpio_free()
- place irq[] array in struct mdio_gpio_info
- add module description, author and license
- add note about compiling this driver as module
- rename mdc and mdio function (were ugly names)
- change MII to MDIO in bus name
- add __init __exit to module (un)loading functions
- probe fails if no phys added to the bus
- kzalloc bitbang with sizeof(*bitbang)

Changes since v3:
- keep bus naming "%x" to be compatible with existing drivers.

Changes since v2:
- more #ifdefs reduction
- platform driver will be registered on OF platforms also
- unified platform and OF bus_id to phy%i

Changes since v1:
- removed NO_IRQ
- reduced #idefs

Laurent, please test this driver under OF.

Signed-off-by: Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig     |   5 +-
 drivers/net/phy/mdio-gpio.c | 232 +++++++++++++++++++++++++++++++-------------
 include/linux/mdio-gpio.h   |  25 +++++
 3 files changed, 191 insertions(+), 71 deletions(-)
 create mode 100644 include/linux/mdio-gpio.h

(limited to 'include/linux')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 03180775199..c4c5a2f8ec7 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -86,8 +86,11 @@ config MDIO_BITBANG
 
 config MDIO_GPIO
 	tristate "Support for GPIO lib-based bitbanged MDIO buses"
-	depends on MDIO_BITBANG && OF_GPIO
+	depends on MDIO_BITBANG && GENERIC_GPIO
 	---help---
 	  Supports GPIO lib-based MDIO busses.
 
+	  To compile this driver as a module, choose M here: the module
+	  will be called mdio-gpio.
+
 endif # PHYLIB
diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 2ff97754e57..a439ebeb431 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -1,9 +1,12 @@
 /*
- * OpenFirmware GPIO based MDIO bitbang driver.
+ * GPIO based MDIO bitbang driver.
+ * Supports OpenFirmware.
  *
  * Copyright (c) 2008 CSE Semaphore Belgium.
  *  by Laurent Pinchart <laurentp@cse-semaphore.com>
  *
+ * Copyright (C) 2008, Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
  * Based on earlier work by
  *
  * Copyright (c) 2003 Intracom S.A.
@@ -21,9 +24,14 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
-#include <linux/mdio-bitbang.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/mdio-gpio.h>
+
+#ifdef CONFIG_OF_GPIO
 #include <linux/of_gpio.h>
 #include <linux/of_platform.h>
+#endif
 
 struct mdio_gpio_info {
 	struct mdiobb_ctrl ctrl;
@@ -41,7 +49,7 @@ static void mdio_dir(struct mdiobb_ctrl *ctrl, int dir)
 		gpio_direction_input(bitbang->mdio);
 }
 
-static int mdio_read(struct mdiobb_ctrl *ctrl)
+static int mdio_get(struct mdiobb_ctrl *ctrl)
 {
 	struct mdio_gpio_info *bitbang =
 		container_of(ctrl, struct mdio_gpio_info, ctrl);
@@ -49,7 +57,7 @@ static int mdio_read(struct mdiobb_ctrl *ctrl)
 	return gpio_get_value(bitbang->mdio);
 }
 
-static void mdio(struct mdiobb_ctrl *ctrl, int what)
+static void mdio_set(struct mdiobb_ctrl *ctrl, int what)
 {
 	struct mdio_gpio_info *bitbang =
 		container_of(ctrl, struct mdio_gpio_info, ctrl);
@@ -57,7 +65,7 @@ static void mdio(struct mdiobb_ctrl *ctrl, int what)
 	gpio_set_value(bitbang->mdio, what);
 }
 
-static void mdc(struct mdiobb_ctrl *ctrl, int what)
+static void mdc_set(struct mdiobb_ctrl *ctrl, int what)
 {
 	struct mdio_gpio_info *bitbang =
 		container_of(ctrl, struct mdio_gpio_info, ctrl);
@@ -67,93 +75,69 @@ static void mdc(struct mdiobb_ctrl *ctrl, int what)
 
 static struct mdiobb_ops mdio_gpio_ops = {
 	.owner = THIS_MODULE,
-	.set_mdc = mdc,
+	.set_mdc = mdc_set,
 	.set_mdio_dir = mdio_dir,
-	.set_mdio_data = mdio,
-	.get_mdio_data = mdio_read,
+	.set_mdio_data = mdio_set,
+	.get_mdio_data = mdio_get,
 };
 
-static int __devinit mdio_ofgpio_bitbang_init(struct mii_bus *bus,
-                                         struct device_node *np)
-{
-	struct mdio_gpio_info *bitbang = bus->priv;
-
-	bitbang->mdc = of_get_gpio(np, 0);
-	bitbang->mdio = of_get_gpio(np, 1);
-
-	if (bitbang->mdc < 0 || bitbang->mdio < 0)
-		return -ENODEV;
-
-	snprintf(bus->id, MII_BUS_ID_SIZE, "%x", bitbang->mdc);
-	return 0;
-}
-
-static void __devinit add_phy(struct mii_bus *bus, struct device_node *np)
+static int __devinit mdio_gpio_bus_init(struct device *dev,
+					struct mdio_gpio_platform_data *pdata,
+					int bus_id)
 {
-	const u32 *data;
-	int len, id, irq;
-
-	data = of_get_property(np, "reg", &len);
-	if (!data || len != 4)
-		return;
-
-	id = *data;
-	bus->phy_mask &= ~(1 << id);
-
-	irq = of_irq_to_resource(np, 0, NULL);
-	if (irq != NO_IRQ)
-		bus->irq[id] = irq;
-}
-
-static int __devinit mdio_ofgpio_probe(struct of_device *ofdev,
-                                        const struct of_device_id *match)
-{
-	struct device_node *np = NULL;
 	struct mii_bus *new_bus;
 	struct mdio_gpio_info *bitbang;
 	int ret = -ENOMEM;
 	int i;
 
-	bitbang = kzalloc(sizeof(struct mdio_gpio_info), GFP_KERNEL);
+	bitbang = kzalloc(sizeof(*bitbang), GFP_KERNEL);
 	if (!bitbang)
 		goto out;
 
 	bitbang->ctrl.ops = &mdio_gpio_ops;
+	bitbang->mdc = pdata->mdc;
+	bitbang->mdio = pdata->mdio;
 
 	new_bus = alloc_mdio_bitbang(&bitbang->ctrl);
 	if (!new_bus)
 		goto out_free_bitbang;
 
-	new_bus->name = "GPIO Bitbanged MII",
+	new_bus->name = "GPIO Bitbanged MDIO",
 
-	ret = mdio_ofgpio_bitbang_init(new_bus, ofdev->node);
-	if (ret)
-		goto out_free_bus;
+	ret = -ENODEV;
 
-	new_bus->phy_mask = ~0;
-	new_bus->irq = kmalloc(sizeof(int) * PHY_MAX_ADDR, GFP_KERNEL);
-	if (!new_bus->irq)
+	new_bus->phy_mask = pdata->phy_mask;
+	new_bus->irq = pdata->irqs;
+	new_bus->parent = dev;
+
+	if (new_bus->phy_mask == ~0)
 		goto out_free_bus;
 
 	for (i = 0; i < PHY_MAX_ADDR; i++)
-		new_bus->irq[i] = -1;
+		if (!new_bus->irq[i])
+			new_bus->irq[i] = PHY_POLL;
 
-	while ((np = of_get_next_child(ofdev->node, np)))
-		if (!strcmp(np->type, "ethernet-phy"))
-			add_phy(new_bus, np);
+	snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", bus_id);
+
+	if (gpio_request(bitbang->mdc, "mdc"))
+		goto out_free_bus;
 
-	new_bus->parent = &ofdev->dev;
-	dev_set_drvdata(&ofdev->dev, new_bus);
+	if (gpio_request(bitbang->mdio, "mdio"))
+		goto out_free_mdc;
+
+	dev_set_drvdata(dev, new_bus);
 
 	ret = mdiobus_register(new_bus);
 	if (ret)
-		goto out_free_irqs;
+		goto out_free_all;
 
 	return 0;
 
-out_free_irqs:
-	dev_set_drvdata(&ofdev->dev, NULL);
-	kfree(new_bus->irq);
+out_free_all:
+	dev_set_drvdata(dev, NULL);
+	gpio_free(bitbang->mdio);
+out_free_mdc:
+	gpio_free(bitbang->mdc);
 out_free_bus:
 	free_mdio_bitbang(new_bus);
 out_free_bitbang:
@@ -162,16 +146,86 @@ out:
 	return ret;
 }
 
-static int mdio_ofgpio_remove(struct of_device *ofdev)
+static void __devexit mdio_gpio_bus_destroy(struct device *dev)
 {
-	struct mii_bus *bus = dev_get_drvdata(&ofdev->dev);
+	struct mii_bus *bus = dev_get_drvdata(dev);
 	struct mdio_gpio_info *bitbang = bus->priv;
 
 	mdiobus_unregister(bus);
-	kfree(bus->irq);
 	free_mdio_bitbang(bus);
-	dev_set_drvdata(&ofdev->dev, NULL);
+	dev_set_drvdata(dev, NULL);
+	gpio_free(bitbang->mdc);
+	gpio_free(bitbang->mdio);
 	kfree(bitbang);
+}
+
+static int __devinit mdio_gpio_probe(struct platform_device *pdev)
+{
+	struct mdio_gpio_platform_data *pdata = pdev->dev.platform_data;
+
+	if (!pdata)
+		return -ENODEV;
+
+	return mdio_gpio_bus_init(&pdev->dev, pdata, pdev->id);
+}
+
+static int __devexit mdio_gpio_remove(struct platform_device *pdev)
+{
+	mdio_gpio_bus_destroy(&pdev->dev);
+
+	return 0;
+}
+
+#ifdef CONFIG_OF_GPIO
+static void __devinit add_phy(struct mdio_gpio_platform_data *pdata,
+			      struct device_node *np)
+{
+	const u32 *data;
+	int len, id, irq;
+
+	data = of_get_property(np, "reg", &len);
+	if (!data || len != 4)
+		return;
+
+	id = *data;
+	pdata->phy_mask &= ~(1 << id);
+
+	irq = of_irq_to_resource(np, 0, NULL);
+	if (irq)
+		pdata->irqs[id] = irq;
+}
+
+static int __devinit mdio_ofgpio_probe(struct of_device *ofdev,
+                                        const struct of_device_id *match)
+{
+	struct device_node *np = NULL;
+	struct mdio_gpio_platform_data *pdata;
+
+	pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
+	if (!pdata)
+		return -ENOMEM;
+
+	pdata->mdc = of_get_gpio(ofdev->node, 0);
+	pdata->mdio = of_get_gpio(ofdev->node, 1);
+
+	if (pdata->mdc < 0 || pdata->mdio < 0)
+		goto out_free;
+
+	while ((np = of_get_next_child(ofdev->node, np)))
+		if (!strcmp(np->type, "ethernet-phy"))
+			add_phy(pdata, np);
+
+	return mdio_gpio_bus_init(&ofdev->dev, pdata, pdata->mdc);
+
+out_free:
+	kfree(pdata);
+	return -ENODEV;
+}
+
+static int __devexit mdio_ofgpio_remove(struct of_device *ofdev)
+{
+	mdio_gpio_bus_destroy(&ofdev->dev);
+	kfree(ofdev->dev.platform_data);
 
 	return 0;
 }
@@ -187,18 +241,56 @@ static struct of_platform_driver mdio_ofgpio_driver = {
 	.name = "mdio-gpio",
 	.match_table = mdio_ofgpio_match,
 	.probe = mdio_ofgpio_probe,
-	.remove = mdio_ofgpio_remove,
+	.remove = __devexit_p(mdio_ofgpio_remove),
 };
 
-static int mdio_ofgpio_init(void)
+static inline int __init mdio_ofgpio_init(void)
 {
 	return of_register_platform_driver(&mdio_ofgpio_driver);
 }
 
-static void mdio_ofgpio_exit(void)
+static inline void __exit mdio_ofgpio_exit(void)
 {
 	of_unregister_platform_driver(&mdio_ofgpio_driver);
 }
+#else
+static inline int __init mdio_ofgpio_init(void) { return 0; }
+static inline void __exit mdio_ofgpio_exit(void) { }
+#endif /* CONFIG_OF_GPIO */
+
+static struct platform_driver mdio_gpio_driver = {
+	.probe = mdio_gpio_probe,
+	.remove = __devexit_p(mdio_gpio_remove),
+	.driver		= {
+		.name	= "mdio-gpio",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init mdio_gpio_init(void)
+{
+	int ret;
+
+	ret = mdio_ofgpio_init();
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&mdio_gpio_driver);
+	if (ret)
+		mdio_ofgpio_exit();
+
+	return ret;
+}
+module_init(mdio_gpio_init);
+
+static void __exit mdio_gpio_exit(void)
+{
+	platform_driver_unregister(&mdio_gpio_driver);
+	mdio_ofgpio_exit();
+}
+module_exit(mdio_gpio_exit);
 
-module_init(mdio_ofgpio_init);
-module_exit(mdio_ofgpio_exit);
+MODULE_ALIAS("platform:mdio-gpio");
+MODULE_AUTHOR("Laurent Pinchart, Paulius Zaleckas");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Generic driver for MDIO bus emulation using GPIO");
diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h
new file mode 100644
index 00000000000..e9d3fdfe41d
--- /dev/null
+++ b/include/linux/mdio-gpio.h
@@ -0,0 +1,25 @@
+/*
+ * MDIO-GPIO bus platform data structures
+ *
+ * Copyright (C) 2008, Paulius Zaleckas <paulius.zaleckas@teltonika.lt>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __LINUX_MDIO_GPIO_H
+#define __LINUX_MDIO_GPIO_H
+
+#include <linux/mdio-bitbang.h>
+
+struct mdio_gpio_platform_data {
+	/* GPIO numbers for bus pins */
+	unsigned int mdc;
+	unsigned int mdio;
+
+	unsigned int phy_mask;
+	int irqs[PHY_MAX_ADDR];
+};
+
+#endif /* __LINUX_MDIO_GPIO_H */
-- 
cgit v1.2.3-70-g09d2


From e8b2dfe9b4501ed0047459b2756ba26e5a940a69 Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Sun, 16 Nov 2008 19:32:39 -0800
Subject: TPROXY: implemented IP_RECVORIGDSTADDR socket option

In case UDP traffic is redirected to a local UDP socket,
the originally addressed destination address/port
cannot be recovered with the in-kernel tproxy.

This patch adds an IP_RECVORIGDSTADDR sockopt that enables
a IP_ORIGDSTADDR ancillary message in recvmsg(). This
ancillary message contains the original destination address/port
of the packet being received.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h     |  4 ++++
 net/ipv4/ip_sockglue.c | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/in.h b/include/linux/in.h
index db458beef19..d60122a3a08 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -80,6 +80,10 @@ struct in_addr {
 /* BSD compatibility */
 #define IP_RECVRETOPTS	IP_RETOPTS
 
+/* TProxy original addresses */
+#define IP_ORIGDSTADDR       20
+#define IP_RECVORIGDSTADDR   IP_ORIGDSTADDR
+
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
 #define IP_PMTUDISC_WANT		1	/* Use per route hints	*/
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e976efeb145..624b534201e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -48,6 +48,7 @@
 #define IP_CMSG_RECVOPTS	8
 #define IP_CMSG_RETOPTS		16
 #define IP_CMSG_PASSSEC		32
+#define IP_CMSG_ORIGDSTADDR     64
 
 /*
  *	SOL_IP control messages.
@@ -126,6 +127,27 @@ static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
 	security_release_secctx(secdata, seclen);
 }
 
+void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct sockaddr_in sin;
+	struct iphdr *iph = ip_hdr(skb);
+	u16 *ports = (u16 *) skb_transport_header(skb);
+
+	if (skb_transport_offset(skb) + 4 > skb->len)
+		return;
+
+	/* All current transport protocols have the port numbers in the
+	 * first four bytes of the transport header and this function is
+	 * written with this assumption in mind.
+	 */
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = iph->daddr;
+	sin.sin_port = ports[1];
+	memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+	put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin);
+}
 
 void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
@@ -160,6 +182,12 @@ void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 
 	if (flags & 1)
 		ip_cmsg_recv_security(msg, skb);
+
+	if ((flags>>=1) == 0)
+		return;
+	if (flags & 1)
+		ip_cmsg_recv_dstaddr(msg, skb);
+
 }
 
 int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
@@ -421,7 +449,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
 			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
 	    optname == IP_MULTICAST_TTL ||
-	    optname == IP_MULTICAST_LOOP) {
+	    optname == IP_MULTICAST_LOOP ||
+	    optname == IP_RECVORIGDSTADDR) {
 		if (optlen >= sizeof(int)) {
 			if (get_user(val, (int __user *) optval))
 				return -EFAULT;
@@ -509,6 +538,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		else
 			inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
 		break;
+	case IP_RECVORIGDSTADDR:
+		if (val)
+			inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
+		else
+			inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
+		break;
 	case IP_TOS:	/* This sets both TOS and Precedence */
 		if (sk->sk_type == SOCK_STREAM) {
 			val &= ~3;
@@ -1022,6 +1057,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_PASSSEC:
 		val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
 		break;
+	case IP_RECVORIGDSTADDR:
+		val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
+		break;
 	case IP_TOS:
 		val = inet->tos;
 		break;
-- 
cgit v1.2.3-70-g09d2


From bbaffaca4810de1a25e32ecaf836eeaacc7a3d11 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 16 Nov 2008 19:37:55 -0800
Subject: rcu: Introduce hlist_nulls variant of hlist

hlist uses NULL value to finish a chain.

hlist_nulls variant use the low order bit set to 1 to signal an end-of-list marker.

This allows to store many different end markers, so that some RCU lockless
algos (used in TCP/UDP stack for example) can save some memory barriers in
fast paths.

Two new files are added :

include/linux/list_nulls.h
  - mimics hlist part of include/linux/list.h, derived to hlist_nulls variant

include/linux/rculist_nulls.h
  - mimics hlist part of include/linux/rculist.h, derived to hlist_nulls variant

   Only four helpers are declared for the moment :

     hlist_nulls_del_init_rcu(), hlist_nulls_del_rcu(),
     hlist_nulls_add_head_rcu() and hlist_nulls_for_each_entry_rcu()

prefetches() were removed, since an end of list is not anymore NULL value.
prefetches() could trigger useless (and possibly dangerous) memory transactions.

Example of use (extracted from __udp4_lib_lookup())

	struct sock *sk, *result;
        struct hlist_nulls_node *node;
        unsigned short hnum = ntohs(dport);
        unsigned int hash = udp_hashfn(net, hnum);
        struct udp_hslot *hslot = &udptable->hash[hash];
        int score, badness;

        rcu_read_lock();
begin:
        result = NULL;
        badness = -1;
        sk_nulls_for_each_rcu(sk, node, &hslot->head) {
                score = compute_score(sk, net, saddr, hnum, sport,
                                      daddr, dport, dif);
                if (score > badness) {
                        result = sk;
                        badness = score;
                }
        }
        /*
         * if the nulls value we got at the end of this lookup is
         * not the expected one, we must restart lookup.
         * We probably met an item that was moved to another chain.
         */
        if (get_nulls_value(node) != hash)
                goto begin;

        if (result) {
                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
                        result = NULL;
                else if (unlikely(compute_score(result, net, saddr, hnum, sport,
                                  daddr, dport, dif) < badness)) {
                        sock_put(result);
                        goto begin;
                }
        }
        rcu_read_unlock();
        return result;

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list_nulls.h    |  94 ++++++++++++++++++++++++++++++++++++
 include/linux/rculist_nulls.h | 110 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 include/linux/list_nulls.h
 create mode 100644 include/linux/rculist_nulls.h

(limited to 'include/linux')

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
new file mode 100644
index 00000000000..93150ecf3ea
--- /dev/null
+++ b/include/linux/list_nulls.h
@@ -0,0 +1,94 @@
+#ifndef _LINUX_LIST_NULLS_H
+#define _LINUX_LIST_NULLS_H
+
+/*
+ * Special version of lists, where end of list is not a NULL pointer,
+ * but a 'nulls' marker, which can have many different values.
+ * (up to 2^31 different values guaranteed on all platforms)
+ *
+ * In the standard hlist, termination of a list is the NULL pointer.
+ * In this special 'nulls' variant, we use the fact that objects stored in
+ * a list are aligned on a word (4 or 8 bytes alignment).
+ * We therefore use the last significant bit of 'ptr' :
+ * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
+ * Set to 0 : This is a pointer to some object (ptr)
+ */
+
+struct hlist_nulls_head {
+	struct hlist_nulls_node *first;
+};
+
+struct hlist_nulls_node {
+	struct hlist_nulls_node *next, **pprev;
+};
+#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
+	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))
+
+#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)
+/**
+ * ptr_is_a_nulls - Test if a ptr is a nulls
+ * @ptr: ptr to be tested
+ *
+ */
+static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
+{
+	return ((unsigned long)ptr & 1);
+}
+
+/**
+ * get_nulls_value - Get the 'nulls' value of the end of chain
+ * @ptr: end of chain
+ *
+ * Should be called only if is_a_nulls(ptr);
+ */
+static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
+{
+	return ((unsigned long)ptr) >> 1;
+}
+
+static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
+{
+	return is_a_nulls(h->first);
+}
+
+static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
+{
+	struct hlist_nulls_node *next = n->next;
+	struct hlist_nulls_node **pprev = n->pprev;
+	*pprev = next;
+	if (!is_a_nulls(next))
+		next->pprev = pprev;
+}
+
+/**
+ * hlist_nulls_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry(tpos, pos, head, member)		       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_from(tpos, pos, member)	\
+	for (; (!is_a_nulls(pos)) && 				\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+#endif
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
new file mode 100644
index 00000000000..f9ddd03961a
--- /dev/null
+++ b/include/linux/rculist_nulls.h
@@ -0,0 +1,110 @@
+#ifndef _LINUX_RCULIST_NULLS_H
+#define _LINUX_RCULIST_NULLS_H
+
+#ifdef __KERNEL__
+
+/*
+ * RCU-protected list version
+ */
+#include <linux/list_nulls.h>
+#include <linux/rcupdate.h>
+
+/**
+ * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: hlist_nulls_unhashed() on the node return true after this. It is
+ * useful for RCU based read lockfree traversal if the writer side
+ * must know if the list entry is still hashed or already unhashed.
+ *
+ * In particular, it means that we can not poison the forward pointers
+ * that may still be used for walking the hash list and we can only
+ * zero the pprev pointer so list_unhashed() will return true after
+ * this.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another
+ * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
+ * hlist_nulls_del_rcu(), running on this same list.  However, it is
+ * perfectly legal to run concurrently with the _rcu list-traversal
+ * primitives, such as hlist_nulls_for_each_entry_rcu().
+ */
+static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
+{
+	if (!hlist_nulls_unhashed(n)) {
+		__hlist_nulls_del(n);
+		n->pprev = NULL;
+	}
+}
+
+/**
+ * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
+ * @n: the element to delete from the hash list.
+ *
+ * Note: hlist_nulls_unhashed() on entry does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the hash list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry().
+ */
+static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
+{
+	__hlist_nulls_del(n);
+	n->pprev = LIST_POISON2;
+}
+
+/**
+ * hlist_nulls_add_head_rcu
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * Description:
+ * Adds the specified element to the specified hlist_nulls,
+ * while permitting racing traversals.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
+ * or hlist_nulls_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
+ * problems on Alpha CPUs.  Regardless of the type of CPU, the
+ * list-traversal primitive must be guarded by rcu_read_lock().
+ */
+static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
+					struct hlist_nulls_head *h)
+{
+	struct hlist_nulls_node *first = h->first;
+
+	n->next = first;
+	n->pprev = &h->first;
+	rcu_assign_pointer(h->first, n);
+	if (!is_a_nulls(first))
+		first->pprev = &n->next;
+}
+/**
+ * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_nulls_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_nulls_node within the struct.
+ *
+ */
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
+	for (pos = rcu_dereference((head)->first);			 \
+		(!is_a_nulls(pos)) && 			\
+		({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
+		pos = rcu_dereference(pos->next))
+
+#endif
+#endif
-- 
cgit v1.2.3-70-g09d2


From 88ab1932eac721c6e7336708558fa5ed02c85c80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 16 Nov 2008 19:39:21 -0800
Subject: udp: Use hlist_nulls in UDP RCU code

This is a straightforward patch, using hlist_nulls infrastructure.

RCUification already done on UDP two weeks ago.

Using hlist_nulls permits us to avoid some memory barriers, both
at lookup time and delete time.

Patch is large because it adds new macros to include/net/sock.h.
These macros will be used by TCP & DCCP in next patch.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rculist.h | 17 ---------------
 include/net/sock.h      | 57 +++++++++++++++++++++++++++++++++++++++----------
 include/net/udp.h       |  2 +-
 net/ipv4/udp.c          | 47 +++++++++++++++++++---------------------
 net/ipv6/udp.c          | 26 +++++++++++-----------
 5 files changed, 83 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 3ba2998b22b..e649bd3f2c9 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -383,22 +383,5 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
 		pos = rcu_dereference(pos->next))
 
-/**
- * hlist_for_each_entry_rcu_safenext - iterate over rcu list of given type
- * @tpos:	the type * to use as a loop cursor.
- * @pos:	the &struct hlist_node to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the hlist_node within the struct.
- * @next:       the &struct hlist_node to use as a next cursor
- *
- * Special version of hlist_for_each_entry_rcu that make sure
- * each next pointer is fetched before each iteration.
- */
-#define hlist_for_each_entry_rcu_safenext(tpos, pos, head, member, next) \
-	for (pos = rcu_dereference((head)->first);			 \
-		pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&	\
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-		pos = rcu_dereference(next))
-
 #endif	/* __KERNEL__ */
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 8b2b82131b6..0a638948868 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -42,6 +42,7 @@
 
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/list_nulls.h>
 #include <linux/timer.h>
 #include <linux/cache.h>
 #include <linux/module.h>
@@ -52,6 +53,7 @@
 #include <linux/security.h>
 
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 
 #include <asm/atomic.h>
 #include <net/dst.h>
@@ -106,6 +108,7 @@ struct net;
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_refcnt: reference count
  *	@skc_hash: hash value used with various protocol lookup tables
@@ -120,7 +123,10 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;
 	int			skc_bound_dev_if;
-	struct hlist_node	skc_node;
+	union {
+		struct hlist_node	skc_node;
+		struct hlist_nulls_node skc_nulls_node;
+	};
 	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
 	unsigned int		skc_hash;
@@ -206,6 +212,7 @@ struct sock {
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_node			__sk_common.skc_node
+#define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_hash			__sk_common.skc_hash
@@ -300,12 +307,30 @@ static inline struct sock *sk_head(const struct hlist_head *head)
 	return hlist_empty(head) ? NULL : __sk_head(head);
 }
 
+static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
+{
+	return hlist_nulls_entry(head->first, struct sock, sk_nulls_node);
+}
+
+static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
+{
+	return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head);
+}
+
 static inline struct sock *sk_next(const struct sock *sk)
 {
 	return sk->sk_node.next ?
 		hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
 }
 
+static inline struct sock *sk_nulls_next(const struct sock *sk)
+{
+	return (!is_a_nulls(sk->sk_nulls_node.next)) ?
+		hlist_nulls_entry(sk->sk_nulls_node.next,
+				  struct sock, sk_nulls_node) :
+		NULL;
+}
+
 static inline int sk_unhashed(const struct sock *sk)
 {
 	return hlist_unhashed(&sk->sk_node);
@@ -321,6 +346,11 @@ static __inline__ void sk_node_init(struct hlist_node *node)
 	node->pprev = NULL;
 }
 
+static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node)
+{
+	node->pprev = NULL;
+}
+
 static __inline__ void __sk_del_node(struct sock *sk)
 {
 	__hlist_del(&sk->sk_node);
@@ -367,18 +397,18 @@ static __inline__ int sk_del_node_init(struct sock *sk)
 	return rc;
 }
 
-static __inline__ int __sk_del_node_init_rcu(struct sock *sk)
+static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk)
 {
 	if (sk_hashed(sk)) {
-		hlist_del_init_rcu(&sk->sk_node);
+		hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
 		return 1;
 	}
 	return 0;
 }
 
-static __inline__ int sk_del_node_init_rcu(struct sock *sk)
+static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk)
 {
-	int rc = __sk_del_node_init_rcu(sk);
+	int rc = __sk_nulls_del_node_init_rcu(sk);
 
 	if (rc) {
 		/* paranoid for a while -acme */
@@ -399,15 +429,15 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list)
 	__sk_add_node(sk, list);
 }
 
-static __inline__ void __sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
-	hlist_add_head_rcu(&sk->sk_node, list);
+	hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
 }
 
-static __inline__ void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
+static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
 {
 	sock_hold(sk);
-	__sk_add_node_rcu(sk, list);
+	__sk_nulls_add_node_rcu(sk, list);
 }
 
 static __inline__ void __sk_del_bind_node(struct sock *sk)
@@ -423,11 +453,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk,
 
 #define sk_for_each(__sk, node, list) \
 	hlist_for_each_entry(__sk, node, list, sk_node)
-#define sk_for_each_rcu_safenext(__sk, node, list, next) \
-	hlist_for_each_entry_rcu_safenext(__sk, node, list, sk_node, next)
+#define sk_nulls_for_each(__sk, node, list) \
+	hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
+#define sk_nulls_for_each_rcu(__sk, node, list) \
+	hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
 #define sk_for_each_from(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_from(__sk, node, sk_node)
+#define sk_nulls_for_each_from(__sk, node) \
+	if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
+		hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
 #define sk_for_each_continue(__sk, node) \
 	if (__sk && ({ node = &(__sk)->sk_node; 1; })) \
 		hlist_for_each_entry_continue(__sk, node, sk_node)
diff --git a/include/net/udp.h b/include/net/udp.h
index df2bfe54537..90e6ce56be6 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -51,7 +51,7 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
 struct udp_hslot {
-	struct hlist_head	head;
+	struct hlist_nulls_head	head;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));
 struct udp_table {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 54badc9a019..fea2d873dd4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -127,9 +127,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 						 const struct sock *sk2))
 {
 	struct sock *sk2;
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 
-	sk_for_each(sk2, node, &hslot->head)
+	sk_nulls_for_each(sk2, node, &hslot->head)
 		if (net_eq(sock_net(sk2), net)			&&
 		    sk2 != sk					&&
 		    sk2->sk_hash == num				&&
@@ -189,12 +189,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
 	inet_sk(sk)->num = snum;
 	sk->sk_hash = snum;
 	if (sk_unhashed(sk)) {
-		/*
-		 * We need that previous write to sk->sk_hash committed
-		 * before write to sk->next done in following add_node() variant
-		 */
-		smp_wmb();
-		sk_add_node_rcu(sk, &hslot->head);
+		sk_nulls_add_node_rcu(sk, &hslot->head);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
 	error = 0;
@@ -261,7 +256,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 		int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -271,13 +266,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
-		/*
-		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
-		 * We must check this item was not moved to another chain
-		 */
-		if (udp_hashfn(net, sk->sk_hash) != hash)
-			goto begin;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 		score = compute_score(sk, net, saddr, hnum, sport,
 				      daddr, dport, dif);
 		if (score > badness) {
@@ -285,6 +274,14 @@ begin:
 			badness = score;
 		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -325,11 +322,11 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 					     __be16 rmt_port, __be32 rmt_addr,
 					     int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *s = sk;
 	unsigned short hnum = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net)				||
@@ -977,7 +974,7 @@ void udp_lib_unhash(struct sock *sk)
 	struct udp_hslot *hslot = &udptable->hash[hash];
 
 	spin_lock_bh(&hslot->lock);
-	if (sk_del_node_init_rcu(sk)) {
+	if (sk_nulls_del_node_init_rcu(sk)) {
 		inet_sk(sk)->num = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
@@ -1130,7 +1127,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	int dif;
 
 	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
 	dif = skb->dev->ifindex;
 	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (sk) {
@@ -1139,7 +1136,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 		do {
 			struct sk_buff *skb1 = skb;
 
-			sknext = udp_v4_mcast_next(net, sk_next(sk), uh->dest,
+			sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
 						   daddr, uh->source, saddr,
 						   dif);
 			if (sknext)
@@ -1560,10 +1557,10 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 	struct net *net = seq_file_net(seq);
 
 	for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
-		struct hlist_node *node;
+		struct hlist_nulls_node *node;
 		struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
 		spin_lock_bh(&hslot->lock);
-		sk_for_each(sk, node, &hslot->head) {
+		sk_nulls_for_each(sk, node, &hslot->head) {
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
@@ -1582,7 +1579,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 	struct net *net = seq_file_net(seq);
 
 	do {
-		sk = sk_next(sk);
+		sk = sk_nulls_next(sk);
 	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
@@ -1753,7 +1750,7 @@ void __init udp_table_init(struct udp_table *table)
 	int i;
 
 	for (i = 0; i < UDP_HTABLE_SIZE; i++) {
-		INIT_HLIST_HEAD(&table->hash[i].head);
+		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
 		spin_lock_init(&table->hash[i].lock);
 	}
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8dafa36b1ba..fd2d9ad4a8a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -98,7 +98,7 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;
-	struct hlist_node *node, *next;
+	struct hlist_nulls_node *node;
 	unsigned short hnum = ntohs(dport);
 	unsigned int hash = udp_hashfn(net, hnum);
 	struct udp_hslot *hslot = &udptable->hash[hash];
@@ -108,19 +108,21 @@ static struct sock *__udp6_lib_lookup(struct net *net,
 begin:
 	result = NULL;
 	badness = -1;
-	sk_for_each_rcu_safenext(sk, node, &hslot->head, next) {
-		/*
-		 * lockless reader, and SLAB_DESTROY_BY_RCU items:
-		 * We must check this item was not moved to another chain
-		 */
-		if (udp_hashfn(net, sk->sk_hash) != hash)
-			goto begin;
+	sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 		score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
 		if (score > badness) {
 			result = sk;
 			badness = score;
 		}
 	}
+	/*
+	 * if the nulls value we got at the end of this lookup is
+	 * not the expected one, we must restart lookup.
+	 * We probably met an item that was moved to another chain.
+	 */
+	if (get_nulls_value(node) != hash)
+		goto begin;
+
 	if (result) {
 		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 			result = NULL;
@@ -374,11 +376,11 @@ static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
 				      __be16 rmt_port, struct in6_addr *rmt_addr,
 				      int dif)
 {
-	struct hlist_node *node;
+	struct hlist_nulls_node *node;
 	struct sock *s = sk;
 	unsigned short num = ntohs(loc_port);
 
-	sk_for_each_from(s, node) {
+	sk_nulls_for_each_from(s, node) {
 		struct inet_sock *inet = inet_sk(s);
 
 		if (!net_eq(sock_net(s), net))
@@ -423,7 +425,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	int dif;
 
 	spin_lock(&hslot->lock);
-	sk = sk_head(&hslot->head);
+	sk = sk_nulls_head(&hslot->head);
 	dif = inet6_iif(skb);
 	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
 	if (!sk) {
@@ -432,7 +434,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	}
 
 	sk2 = sk;
-	while ((sk2 = udp_v6_mcast_next(net, sk_next(sk2), uh->dest, daddr,
+	while ((sk2 = udp_v6_mcast_next(net, sk_nulls_next(sk2), uh->dest, daddr,
 					uh->source, saddr, dif))) {
 		struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC);
 		if (buff) {
-- 
cgit v1.2.3-70-g09d2


From 3f2c31d90327f21d76d296af34aa4ca547932ff4 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Sun, 16 Nov 2008 22:41:34 -0800
Subject: virtio_net: VIRTIO_NET_F_MSG_RXBUF (imprive rcv buffer allocation)

If segmentation offload is enabled by the host, we currently allocate
maximum sized packet buffers and pass them to the host. This uses up
20 ring entries, allowing us to supply only 20 packet buffers to the
host with a 256 entry ring. This is a huge overhead when receiving
small packets, and is most keenly felt when receiving MTU sized
packets from off-host.

The VIRTIO_NET_F_MRG_RXBUF feature flag is set by hosts which support
using receive buffers which are smaller than the maximum packet size.
In order to transfer large packets to the guest, the host merges
together multiple receive buffers to form a larger logical buffer.
The number of merged buffers is returned to the guest via a field in
the virtio_net_hdr.

Make use of this support by supplying single page receive buffers to
the host. On receive, we extract the virtio_net_hdr, copy 128 bytes of
the payload to the skb's linear data buffer and adjust the fragment
offset to point to the remaining data. This ensures proper alignment
and allows us to not use any paged data for small packets. If the
payload occupies multiple pages, we simply append those pages as
fragments and free the associated skbs.

This scheme allows us to be efficient in our use of ring entries
while still supporting large packets. Benchmarking using netperf from
an external machine to a guest over a 10Gb/s network shows a 100%
improvement from ~1Gb/s to ~2Gb/s. With a local host->guest benchmark
with GSO disabled on the host side, throughput was seen to increase
from 700Mb/s to 1.7Gb/s.

Based on a patch from Herbert Xu.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (use netdev_priv)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c   | 173 +++++++++++++++++++++++++++++++++++++++------
 include/linux/virtio_net.h |   9 +++
 2 files changed, 162 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 27559c987d4..e6b5d6ef9ea 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -34,6 +34,7 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN)
+#define GOOD_COPY_LEN	128
 
 struct virtnet_info
 {
@@ -58,6 +59,9 @@ struct virtnet_info
 	/* I like... big packets and I cannot lie! */
 	bool big_packets;
 
+	/* Host will merge rx buffers for big packets (shake it! shake it!) */
+	bool mergeable_rx_bufs;
+
 	/* Receive & send queues. */
 	struct sk_buff_head recv;
 	struct sk_buff_head send;
@@ -66,16 +70,11 @@ struct virtnet_info
 	struct page *pages;
 };
 
-static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb)
+static inline void *skb_vnet_hdr(struct sk_buff *skb)
 {
 	return (struct virtio_net_hdr *)skb->cb;
 }
 
-static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb)
-{
-	sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr));
-}
-
 static void give_a_page(struct virtnet_info *vi, struct page *page)
 {
 	page->private = (unsigned long)vi->pages;
@@ -121,25 +120,97 @@ static void skb_xmit_done(struct virtqueue *svq)
 static void receive_skb(struct net_device *dev, struct sk_buff *skb,
 			unsigned len)
 {
+	struct virtnet_info *vi = netdev_priv(dev);
 	struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);
 	int err;
+	int i;
 
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
 		dev->stats.rx_length_errors++;
 		goto drop;
 	}
-	len -= sizeof(struct virtio_net_hdr);
 
-	if (len <= MAX_PACKET_LEN)
-		trim_pages(netdev_priv(dev), skb);
+	if (vi->mergeable_rx_bufs) {
+		struct virtio_net_hdr_mrg_rxbuf *mhdr = skb_vnet_hdr(skb);
+		unsigned int copy;
+		char *p = page_address(skb_shinfo(skb)->frags[0].page);
 
-	err = pskb_trim(skb, len);
-	if (err) {
-		pr_debug("%s: pskb_trim failed %i %d\n", dev->name, len, err);
-		dev->stats.rx_dropped++;
-		goto drop;
+		if (len > PAGE_SIZE)
+			len = PAGE_SIZE;
+		len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+
+		memcpy(hdr, p, sizeof(*mhdr));
+		p += sizeof(*mhdr);
+
+		copy = len;
+		if (copy > skb_tailroom(skb))
+			copy = skb_tailroom(skb);
+
+		memcpy(skb_put(skb, copy), p, copy);
+
+		len -= copy;
+
+		if (!len) {
+			give_a_page(vi, skb_shinfo(skb)->frags[0].page);
+			skb_shinfo(skb)->nr_frags--;
+		} else {
+			skb_shinfo(skb)->frags[0].page_offset +=
+				sizeof(*mhdr) + copy;
+			skb_shinfo(skb)->frags[0].size = len;
+			skb->data_len += len;
+			skb->len += len;
+		}
+
+		while (--mhdr->num_buffers) {
+			struct sk_buff *nskb;
+
+			i = skb_shinfo(skb)->nr_frags;
+			if (i >= MAX_SKB_FRAGS) {
+				pr_debug("%s: packet too long %d\n", dev->name,
+					 len);
+				dev->stats.rx_length_errors++;
+				goto drop;
+			}
+
+			nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+			if (!nskb) {
+				pr_debug("%s: rx error: %d buffers missing\n",
+					 dev->name, mhdr->num_buffers);
+				dev->stats.rx_length_errors++;
+				goto drop;
+			}
+
+			__skb_unlink(nskb, &vi->recv);
+			vi->num--;
+
+			skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
+			skb_shinfo(nskb)->nr_frags = 0;
+			kfree_skb(nskb);
+
+			if (len > PAGE_SIZE)
+				len = PAGE_SIZE;
+
+			skb_shinfo(skb)->frags[i].size = len;
+			skb_shinfo(skb)->nr_frags++;
+			skb->data_len += len;
+			skb->len += len;
+		}
+	} else {
+		len -= sizeof(struct virtio_net_hdr);
+
+		if (len <= MAX_PACKET_LEN)
+			trim_pages(vi, skb);
+
+		err = pskb_trim(skb, len);
+		if (err) {
+			pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
+				 len, err);
+			dev->stats.rx_dropped++;
+			goto drop;
+		}
 	}
+
 	skb->truesize += skb->data_len;
 	dev->stats.rx_bytes += skb->len;
 	dev->stats.rx_packets++;
@@ -198,7 +269,7 @@ drop:
 	dev_kfree_skb(skb);
 }
 
-static void try_fill_recv(struct virtnet_info *vi)
+static void try_fill_recv_maxbufs(struct virtnet_info *vi)
 {
 	struct sk_buff *skb;
 	struct scatterlist sg[2+MAX_SKB_FRAGS];
@@ -206,12 +277,16 @@ static void try_fill_recv(struct virtnet_info *vi)
 
 	sg_init_table(sg, 2+MAX_SKB_FRAGS);
 	for (;;) {
+		struct virtio_net_hdr *hdr;
+
 		skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN);
 		if (unlikely(!skb))
 			break;
 
 		skb_put(skb, MAX_PACKET_LEN);
-		vnet_hdr_to_sg(sg, skb);
+
+		hdr = skb_vnet_hdr(skb);
+		sg_init_one(sg, hdr, sizeof(*hdr));
 
 		if (vi->big_packets) {
 			for (i = 0; i < MAX_SKB_FRAGS; i++) {
@@ -247,6 +322,54 @@ static void try_fill_recv(struct virtnet_info *vi)
 	vi->rvq->vq_ops->kick(vi->rvq);
 }
 
+static void try_fill_recv(struct virtnet_info *vi)
+{
+	struct sk_buff *skb;
+	struct scatterlist sg[1];
+	int err;
+
+	if (!vi->mergeable_rx_bufs) {
+		try_fill_recv_maxbufs(vi);
+		return;
+	}
+
+	for (;;) {
+		skb_frag_t *f;
+
+		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
+		if (unlikely(!skb))
+			break;
+
+		skb_reserve(skb, NET_IP_ALIGN);
+
+		f = &skb_shinfo(skb)->frags[0];
+		f->page = get_a_page(vi, GFP_ATOMIC);
+		if (!f->page) {
+			kfree_skb(skb);
+			break;
+		}
+
+		f->page_offset = 0;
+		f->size = PAGE_SIZE;
+
+		skb_shinfo(skb)->nr_frags++;
+
+		sg_init_one(sg, page_address(f->page), PAGE_SIZE);
+		skb_queue_head(&vi->recv, skb);
+
+		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
+		if (err) {
+			skb_unlink(skb, &vi->recv);
+			kfree_skb(skb);
+			break;
+		}
+		vi->num++;
+	}
+	if (unlikely(vi->num > vi->max))
+		vi->max = vi->num;
+	vi->rvq->vq_ops->kick(vi->rvq);
+}
+
 static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
@@ -325,15 +448,14 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 {
 	int num, err;
 	struct scatterlist sg[2+MAX_SKB_FRAGS];
-	struct virtio_net_hdr *hdr;
+	struct virtio_net_hdr_mrg_rxbuf *mhdr = skb_vnet_hdr(skb);
+	struct virtio_net_hdr *hdr = skb_vnet_hdr(skb);
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 
 	sg_init_table(sg, 2+MAX_SKB_FRAGS);
 
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
 
-	/* Encode metadata header at front. */
-	hdr = skb_vnet_hdr(skb);
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
 		hdr->csum_start = skb->csum_start - skb_headroom(skb);
@@ -361,7 +483,14 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 		hdr->gso_size = hdr->hdr_len = 0;
 	}
 
-	vnet_hdr_to_sg(sg, skb);
+	mhdr->num_buffers = 0;
+
+	/* Encode metadata header at front. */
+	if (vi->mergeable_rx_bufs)
+		sg_init_one(sg, mhdr, sizeof(*mhdr));
+	else
+		sg_init_one(sg, hdr, sizeof(*hdr));
+
 	num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
 
 	err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb);
@@ -551,6 +680,9 @@ static int virtnet_probe(struct virtio_device *vdev)
 	    || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
 		vi->big_packets = true;
 
+	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
+		vi->mergeable_rx_bufs = true;
+
 	/* We expect two virtqueues, receive then send. */
 	vi->rvq = vdev->config->find_vq(vdev, 0, skb_recv_done);
 	if (IS_ERR(vi->rvq)) {
@@ -643,6 +775,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6,
 	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
 	VIRTIO_NET_F_GUEST_ECN, /* We don't yet handle UFO input. */
+	VIRTIO_NET_F_MRG_RXBUF,
 	VIRTIO_F_NOTIFY_ON_EMPTY,
 };
 
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 5e33761b9b8..5cdd0aa8bde 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -20,6 +20,7 @@
 #define VIRTIO_NET_F_HOST_TSO6	12	/* Host can handle TSOv6 in. */
 #define VIRTIO_NET_F_HOST_ECN	13	/* Host can handle TSO[6] w/ ECN in. */
 #define VIRTIO_NET_F_HOST_UFO	14	/* Host can handle UFO in. */
+#define VIRTIO_NET_F_MRG_RXBUF	15	/* Host can merge receive buffers. */
 
 struct virtio_net_config
 {
@@ -44,4 +45,12 @@ struct virtio_net_hdr
 	__u16 csum_start;	/* Position to start checksumming from */
 	__u16 csum_offset;	/* Offset after that to place checksum */
 };
+
+/* This is the version of the header to use when the MRG_RXBUF
+ * feature has been negotiated. */
+struct virtio_net_hdr_mrg_rxbuf {
+	struct virtio_net_hdr hdr;
+	__u16 num_buffers;	/* Number of merged rx buffers */
+};
+
 #endif /* _LINUX_VIRTIO_NET_H */
-- 
cgit v1.2.3-70-g09d2


From 49aebc66d6b896f9c7c5739d85c4548c00015aa7 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 16 Nov 2008 22:51:23 -0800
Subject: dccp: Deprecate old setsockopt framework

The previous setsockopt interface, which passed socket options via struct
dccp_so_feat, is complicated/difficult to use. Continuing to support it leads to
ugly code since the old approach did not distinguish between NN and SP values.

This patch removes the old setsockopt interface and replaces it with two new
functions to register NN/SP values for feature negotiation.
These are essentially wrappers around the internal __feat_register functions,
with checking added to avoid

 * wrong usage (type);
 * changing values while the connection is in progress.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  7 -----
 net/dccp/feat.c      | 72 ++++++++++++++++++++--------------------------------
 net/dccp/feat.h      |  5 ++--
 net/dccp/proto.c     | 53 ++------------------------------------
 4 files changed, 32 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index d3ac1bde60b..6eaaca9b037 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -193,13 +193,6 @@ enum dccp_feature_numbers {
 	DCCPF_MAX_CCID_SPECIFIC = 255,
 };
 
-/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */
-struct dccp_so_feat {
-	__u8 dccpsf_feat;
-	__u8 __user *dccpsf_val;
-	__u8 dccpsf_len;
-};
-
 /* DCCP socket options */
 #define DCCP_SOCKOPT_PACKET_SIZE	1 /* XXX deprecated, without effect */
 #define DCCP_SOCKOPT_SERVICE		2
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 4f86a48723f..47ce9abaf72 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -364,53 +364,35 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
 	return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
 }
 
-int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
-		     u8 *val, u8 len, gfp_t gfp)
-{
-	struct dccp_opt_pend *opt;
-
-	dccp_feat_debug(type, feature, *val);
-
-	if (len > 3) {
-		DCCP_WARN("invalid length %d\n", len);
+/**
+ * dccp_feat_register_sp  -  Register requests to change SP feature values
+ * @sk: client or listening socket
+ * @feat: one of %dccp_feature_numbers
+ * @is_local: whether the local (1) or remote (0) @feat is meant
+ * @list: array of preferred values, in descending order of preference
+ * @len: length of @list in bytes
+ */
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+			  u8 const *list, u8 len)
+{	 /* any changes must be registered before establishing the connection */
+	if (sk->sk_state != DCCP_CLOSED)
+		return -EISCONN;
+	if (dccp_feat_type(feat) != FEAT_SP)
 		return -EINVAL;
-	}
-	/* XXX add further sanity checks */
-
-	/* check if that feature is already being negotiated */
-	list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) {
-		/* ok we found a negotiation for this option already */
-		if (opt->dccpop_feat == feature && opt->dccpop_type == type) {
-			dccp_pr_debug("Replacing old\n");
-			/* replace */
-			BUG_ON(opt->dccpop_val == NULL);
-			kfree(opt->dccpop_val);
-			opt->dccpop_val	 = val;
-			opt->dccpop_len	 = len;
-			opt->dccpop_conf = 0;
-			return 0;
-		}
-	}
-
-	/* negotiation for a new feature */
-	opt = kmalloc(sizeof(*opt), gfp);
-	if (opt == NULL)
-		return -ENOMEM;
-
-	opt->dccpop_type = type;
-	opt->dccpop_feat = feature;
-	opt->dccpop_len	 = len;
-	opt->dccpop_val	 = val;
-	opt->dccpop_conf = 0;
-	opt->dccpop_sc	 = NULL;
-
-	BUG_ON(opt->dccpop_val == NULL);
-
-	list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending);
-	return 0;
+	return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
+				  0, list, len);
 }
 
-EXPORT_SYMBOL_GPL(dccp_feat_change);
+/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
+int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
+{
+	/* any changes must be registered before establishing the connection */
+	if (sk->sk_state != DCCP_CLOSED)
+		return -EISCONN;
+	if (dccp_feat_type(feat) != FEAT_NN)
+		return -EINVAL;
+	return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
+}
 
 /*
  *	Tracking features whose value depend on the choice of CCID
@@ -1108,7 +1090,7 @@ int dccp_feat_init(struct sock *sk)
 
 	/* Ack ratio */
 	rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0,
-				dmsk->dccpms_ack_ratio);
+				dp->dccps_l_ack_ratio);
 out:
 	return rc;
 }
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 232c653e69c..4d172822df1 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -111,8 +111,9 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val)
 #define dccp_feat_debug(type, feat, val)
 #endif /* CONFIG_IP_DCCP_DEBUG */
 
-extern int  dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature,
-			     u8 *val, u8 len, gfp_t gfp);
+extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+				  u8 const *list, u8 len);
+extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature,
 				  u8 *val, u8 len);
 extern int  dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 1117d4d8c8f..3090fc40eeb 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -470,44 +470,6 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
 	return 0;
 }
 
-/* byte 1 is feature.  the rest is the preference list */
-static int dccp_setsockopt_change(struct sock *sk, int type,
-				  struct dccp_so_feat __user *optval)
-{
-	struct dccp_so_feat opt;
-	u8 *val;
-	int rc;
-
-	if (copy_from_user(&opt, optval, sizeof(opt)))
-		return -EFAULT;
-	/*
-	 * rfc4340: 6.1. Change Options
-	 */
-	if (opt.dccpsf_len < 1)
-		return -EINVAL;
-
-	val = kmalloc(opt.dccpsf_len, GFP_KERNEL);
-	if (!val)
-		return -ENOMEM;
-
-	if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) {
-		rc = -EFAULT;
-		goto out_free_val;
-	}
-
-	rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat,
-			      val, opt.dccpsf_len, GFP_KERNEL);
-	if (rc)
-		goto out_free_val;
-
-out:
-	return rc;
-
-out_free_val:
-	kfree(val);
-	goto out;
-}
-
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		char __user *optval, int optlen)
 {
@@ -530,20 +492,9 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		err = 0;
 		break;
 	case DCCP_SOCKOPT_CHANGE_L:
-		if (optlen != sizeof(struct dccp_so_feat))
-			err = -EINVAL;
-		else
-			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L,
-						     (struct dccp_so_feat __user *)
-						     optval);
-		break;
 	case DCCP_SOCKOPT_CHANGE_R:
-		if (optlen != sizeof(struct dccp_so_feat))
-			err = -EINVAL;
-		else
-			err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R,
-						     (struct dccp_so_feat __user *)
-						     optval);
+		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+		err = 0;
 		break;
 	case DCCP_SOCKOPT_SERVER_TIMEWAIT:
 		if (dp->dccps_role != DCCP_ROLE_SERVER)
-- 
cgit v1.2.3-70-g09d2


From 29450559849da7066813601effb7666966869853 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 16 Nov 2008 22:53:48 -0800
Subject: dccp: Feature negotiation for minimum-checksum-coverage

This provides feature negotiation for server minimum checksum coverage
which so far has been missing.

Since sender/receiver coverage values range only from 0...15, their
type has also been reduced in size from u16 to u4.

Feature-negotiation options are now generated for both sender and receiver
coverage, i.e. when the peer has `forgotten' to enable partial coverage
then feature negotiation will automatically enable (negotiate) the partial
coverage value for this connection.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  4 ++--
 net/dccp/proto.c     | 53 +++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 42 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 6eaaca9b037..5a5a89935db 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -527,8 +527,8 @@ struct dccp_sock {
 	__u32				dccps_timestamp_time;
 	__u16				dccps_l_ack_ratio;
 	__u16				dccps_r_ack_ratio;
-	__u16				dccps_pcslen;
-	__u16				dccps_pcrlen;
+	__u8				dccps_pcslen:4;
+	__u8				dccps_pcrlen:4;
 	__u64				dccps_ndp_count:48;
 	unsigned long			dccps_rate_last;
 	struct dccp_minisock		dccps_minisock;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 3090fc40eeb..c6b4362bb1d 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -470,6 +470,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
 	return 0;
 }
 
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
+{
+	u8 *list, len;
+	int i, rc;
+
+	if (cscov < 0 || cscov > 15)
+		return -EINVAL;
+	/*
+	 * Populate a list of permissible values, in the range cscov...15. This
+	 * is necessary since feature negotiation of single values only works if
+	 * both sides incidentally choose the same value. Since the list starts
+	 * lowest-value first, negotiation will pick the smallest shared value.
+	 */
+	if (cscov == 0)
+		return 0;
+	len = 16 - cscov;
+
+	list = kmalloc(len, GFP_KERNEL);
+	if (list == NULL)
+		return -ENOBUFS;
+
+	for (i = 0; i < len; i++)
+		list[i] = cscov++;
+
+	rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
+
+	if (rc == 0) {
+		if (rx)
+			dccp_sk(sk)->dccps_pcrlen = cscov;
+		else
+			dccp_sk(sk)->dccps_pcslen = cscov;
+	}
+	kfree(list);
+	return rc;
+}
+
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		char __user *optval, int optlen)
 {
@@ -502,20 +538,11 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		else
 			dp->dccps_server_timewait = (val != 0);
 		break;
-	case DCCP_SOCKOPT_SEND_CSCOV:	/* sender side, RFC 4340, sec. 9.2 */
-		if (val < 0 || val > 15)
-			err = -EINVAL;
-		else
-			dp->dccps_pcslen = val;
+	case DCCP_SOCKOPT_SEND_CSCOV:
+		err = dccp_setsockopt_cscov(sk, val, false);
 		break;
-	case DCCP_SOCKOPT_RECV_CSCOV:	/* receiver side, RFC 4340 sec. 9.2.1 */
-		if (val < 0 || val > 15)
-			err = -EINVAL;
-		else {
-			dp->dccps_pcrlen = val;
-			/* FIXME: add feature negotiation,
-			 * ChangeL(MinimumChecksumCoverage, val) */
-		}
+	case DCCP_SOCKOPT_RECV_CSCOV:
+		err = dccp_setsockopt_cscov(sk, val, true);
 		break;
 	default:
 		err = -ENOPROTOOPT;
-- 
cgit v1.2.3-70-g09d2


From dd9c0e363cef32b7d6f23d4c87e8dfe4f91fd1c5 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 16 Nov 2008 22:55:08 -0800
Subject: dccp: Deprecate Ack Ratio sysctl

This patch deprecates the Ack Ratio sysctl, since
 * Ack Ratio is entirely ignored by CCID-3 and CCID-4,
 * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1);
 * even if it would work in CCID-2, there is no point for a user to change it:
   - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2),
   - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts
     (since waiting for Acks which will never arrive in this window),
   - cwnd is not a user-configurable value.

The only reasonable place for Ack Ratio is to print it for debugging. It is
planned to do this later on, as part of e.g. dccp_probe.

With this patch Ack Ratio is now under full control of feature negotiation:
 * Ack Ratio is resolved as a dependency of the selected CCID;
 * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to
   the default of 2, following RFC 4340, 11.3 - "New connections start with Ack
   Ratio 2 for both endpoints";
 * what happens then is part of another patch set, since it concerns the
   dynamic update of Ack Ratio while the connection is in full flight.

Thanks to Tomasz Grobelny for discussion leading up to this patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dccp.txt | 3 ---
 include/linux/dccp.h              | 2 --
 net/dccp/dccp.h                   | 1 -
 net/dccp/minisocks.c              | 1 -
 net/dccp/options.c                | 1 -
 net/dccp/sysctl.c                 | 7 -------
 6 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index f0aeb20fa63..43df4487379 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -125,9 +125,6 @@ send_ndp = 1
 send_ackvec = 1
 	Whether or not to send Ack Vector options (sec. 11.5).
 
-ack_ratio = 2
-	The default Ack Ratio (sec. 11.3) to use.
-
 tx_ccid = 2
 	Default CCID for the sender-receiver half-connection.
 
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 5a5a89935db..eda389ce04f 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -368,7 +368,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
   * @dccpms_ccid - Congestion Control Id (CCID) (section 10)
   * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
   * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
-  * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3)
   * @dccpms_pending - List of features being negotiated
   * @dccpms_conf -
   */
@@ -378,7 +377,6 @@ struct dccp_minisock {
 	__u8			dccpms_tx_ccid;
 	__u8			dccpms_send_ack_vector;
 	__u8			dccpms_send_ndp_count;
-	__u8			dccpms_ack_ratio;
 	struct list_head	dccpms_pending;
 	struct list_head	dccpms_conf;
 };
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index e656dafb5d9..031ce350d3c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -98,7 +98,6 @@ extern int  sysctl_dccp_retries2;
 extern int  sysctl_dccp_feat_sequence_window;
 extern int  sysctl_dccp_feat_rx_ccid;
 extern int  sysctl_dccp_feat_tx_ccid;
-extern int  sysctl_dccp_feat_ack_ratio;
 extern int  sysctl_dccp_feat_send_ack_vector;
 extern int  sysctl_dccp_feat_send_ndp_count;
 extern int  sysctl_dccp_tx_qlen;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index afdacbb94d7..ed61bc58e41 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -47,7 +47,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk)
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
 	dmsk->dccpms_rx_ccid	     = sysctl_dccp_feat_rx_ccid;
 	dmsk->dccpms_tx_ccid	     = sysctl_dccp_feat_tx_ccid;
-	dmsk->dccpms_ack_ratio	     = sysctl_dccp_feat_ack_ratio;
 	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
 	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
 }
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 67a171a1268..515ad45013a 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -26,7 +26,6 @@
 int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
 int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_ack_ratio	      = DCCPF_INITIAL_ACK_RATIO;
 int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
 int sysctl_dccp_feat_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 21295993fdb..f6e54f433e2 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "ack_ratio",
-		.data		= &sysctl_dccp_feat_ack_ratio,
-		.maxlen		= sizeof(sysctl_dccp_feat_ack_ratio),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "send_ackvec",
 		.data		= &sysctl_dccp_feat_send_ack_vector,
-- 
cgit v1.2.3-70-g09d2


From e17be2b2a95b43fe0d5878adf330701bb7a77115 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 17 Nov 2008 15:24:14 +0000
Subject: uwb: add pal parameter to new reservation callback

The pal parameter allows PALs to retrieve their PAL-specific data
structure.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/rsv.c       |  2 +-
 drivers/uwb/uwb-debug.c | 10 +++++-----
 include/linux/uwb.h     |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index 3d76efe26ac..935d5b536db 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -558,7 +558,7 @@ static struct uwb_rsv *uwb_rsv_new_target(struct uwb_rc *rc,
 	spin_lock(&rc->pal_lock);
 	list_for_each_entry(pal, &rc->pals, node) {
 		if (pal->new_rsv)
-			pal->new_rsv(rsv);
+			pal->new_rsv(pal, rsv);
 		if (rsv->state == UWB_RSV_STATE_T_ACCEPTED)
 			break;
 	}
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 88e6ac71381..217ebaac128 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -306,13 +306,13 @@ static struct file_operations drp_avail_fops = {
 	.owner   = THIS_MODULE,
 };
 
-static void uwb_dbg_new_rsv(struct uwb_rsv *rsv)
+static void uwb_dbg_new_rsv(struct uwb_pal *pal, struct uwb_rsv *rsv)
 {
-	struct uwb_rc *rc = rsv->rc;
+	struct uwb_dbg *dbg = container_of(pal, struct uwb_dbg, pal);
 
-	if (rc->dbg->accept) {
-		list_add_tail(&rsv->pal_node, &rc->dbg->rsvs);
-		uwb_rsv_accept(rsv, uwb_dbg_rsv_cb, NULL);
+	if (dbg->accept) {
+		list_add_tail(&rsv->pal_node, &dbg->rsvs);
+		uwb_rsv_accept(rsv, uwb_dbg_rsv_cb, dbg);
 	}
 }
 
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index c4854848999..effd97998fd 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -405,7 +405,7 @@ struct uwb_pal {
 	struct list_head node;
 	const char *name;
 	struct device *device;
-	void (*new_rsv)(struct uwb_rsv *rsv);
+	void (*new_rsv)(struct uwb_pal *pal, struct uwb_rsv *rsv);
 };
 
 void uwb_pal_init(struct uwb_pal *pal);
-- 
cgit v1.2.3-70-g09d2


From 0231022cc32d5f2e7f3c06b75691dda0ad6aec33 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 17 Nov 2008 03:22:41 +0100
Subject: tracing/function-return-tracer: add the overrun field

Impact: help to find the better depth of trace

We decided to arbitrary define the depth of function return trace as
"20". Perhaps this is not enough. To help finding an optimal depth, we
measure now the overrun: the number of functions that have been missed
for the current thread. By default this is not displayed, we have to
do set a particular flag on the return tracer: echo overrun >
/debug/tracing/trace_options And the overrun will be printed on the
right.

As the trace shows below, the current 20 depth is not enough.

update_wall_time+0x37f/0x8c0 -> update_xtime_cache (345 ns) (Overruns: 2838)
update_wall_time+0x384/0x8c0 -> clocksource_get_next (1141 ns) (Overruns: 2838)
do_timer+0x23/0x100 -> update_wall_time (3882 ns) (Overruns: 2838)
tick_do_update_jiffies64+0xbf/0x160 -> do_timer (5339 ns) (Overruns: 2838)
tick_sched_timer+0x6a/0xf0 -> tick_do_update_jiffies64 (7209 ns) (Overruns: 2838)
vgacon_set_cursor_size+0x98/0x120 -> native_io_delay (2613 ns) (Overruns: 274)
vgacon_cursor+0x16e/0x1d0 -> vgacon_set_cursor_size (33151 ns) (Overruns: 274)
set_cursor+0x5f/0x80 -> vgacon_cursor (36432 ns) (Overruns: 274)
con_flush_chars+0x34/0x40 -> set_cursor (38790 ns) (Overruns: 274)
release_console_sem+0x1ec/0x230 -> up (721 ns) (Overruns: 274)
release_console_sem+0x225/0x230 -> wake_up_klogd (316 ns) (Overruns: 274)
con_flush_chars+0x39/0x40 -> release_console_sem (2996 ns) (Overruns: 274)
con_write+0x22/0x30 -> con_flush_chars (46067 ns) (Overruns: 274)
n_tty_write+0x1cc/0x360 -> con_write (292670 ns) (Overruns: 274)
smp_apic_timer_interrupt+0x2a/0x90 -> native_apic_mem_write (330 ns) (Overruns: 274)
irq_enter+0x17/0x70 -> idle_cpu (413 ns) (Overruns: 274)
smp_apic_timer_interrupt+0x2f/0x90 -> irq_enter (1525 ns) (Overruns: 274)
ktime_get_ts+0x40/0x70 -> getnstimeofday (465 ns) (Overruns: 274)
ktime_get_ts+0x60/0x70 -> set_normalized_timespec (436 ns) (Overruns: 274)
ktime_get+0x16/0x30 -> ktime_get_ts (2501 ns) (Overruns: 274)
hrtimer_interrupt+0x77/0x1a0 -> ktime_get (3439 ns) (Overruns: 274)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/thread_info.h    |  7 +++++++
 arch/x86/kernel/ftrace.c              | 10 ++++++---
 include/linux/ftrace.h                |  2 ++
 include/linux/sched.h                 |  1 +
 kernel/trace/trace.c                  |  1 +
 kernel/trace/trace.h                  |  1 +
 kernel/trace/trace_functions_return.c | 38 +++++++++++++++++++++++++++++------
 7 files changed, 51 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a71158369fd..e90e81ef6ab 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -21,6 +21,7 @@ struct task_struct;
 struct exec_domain;
 #include <asm/processor.h>
 #include <asm/ftrace.h>
+#include <asm/atomic.h>
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
@@ -45,6 +46,11 @@ struct thread_info {
 	int		curr_ret_stack;
 	/* Stack of return addresses for return function tracing */
 	struct ftrace_ret_stack	ret_stack[FTRACE_RET_STACK_SIZE];
+	/*
+	 * Number of functions that haven't been traced
+	 * because of depth overrun.
+	 */
+	atomic_t	trace_overrun;
 #endif
 };
 
@@ -61,6 +67,7 @@ struct thread_info {
 		.fn = do_no_restart_syscall,	\
 	},					\
 	.curr_ret_stack = -1,\
+	.trace_overrun	= ATOMIC_INIT(0)	\
 }
 #else
 #define INIT_THREAD_INFO(tsk)			\
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 924153edd97..356bb1eb6e9 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -353,8 +353,10 @@ static int push_return_trace(unsigned long ret, unsigned long long time,
 	struct thread_info *ti = current_thread_info();
 
 	/* The return trace stack is full */
-	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1)
+	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) {
+		atomic_inc(&ti->trace_overrun);
 		return -EBUSY;
+	}
 
 	index = ++ti->curr_ret_stack;
 	barrier();
@@ -367,7 +369,7 @@ static int push_return_trace(unsigned long ret, unsigned long long time,
 
 /* Retrieve a function return address to the trace stack on thread info.*/
 static void pop_return_trace(unsigned long *ret, unsigned long long *time,
-				unsigned long *func)
+				unsigned long *func, unsigned long *overrun)
 {
 	int index;
 
@@ -376,6 +378,7 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time,
 	*ret = ti->ret_stack[index].ret;
 	*func = ti->ret_stack[index].func;
 	*time = ti->ret_stack[index].calltime;
+	*overrun = atomic_read(&ti->trace_overrun);
 	ti->curr_ret_stack--;
 }
 
@@ -386,7 +389,8 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time,
 unsigned long ftrace_return_to_handler(void)
 {
 	struct ftrace_retfunc trace;
-	pop_return_trace(&trace.ret, &trace.calltime, &trace.func);
+	pop_return_trace(&trace.ret, &trace.calltime, &trace.func,
+			&trace.overrun);
 	trace.rettime = cpu_clock(raw_smp_processor_id());
 	ftrace_function_return(&trace);
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f1af1aab00e..f7ba4ea5e12 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -318,6 +318,8 @@ struct ftrace_retfunc {
 	unsigned long func; /* Current function */
 	unsigned long long calltime;
 	unsigned long long rettime;
+	/* Number of functions that overran the depth limit for current task */
+	unsigned long overrun;
 };
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61c8cc36028..c8e0db46420 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2016,6 +2016,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
 	 * used.
 	 */
 	task_thread_info(p)->curr_ret_stack = -1;
+	atomic_set(&task_thread_info(p)->trace_overrun, 0);
 #endif
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9531fddcfb8..e97c29a6e7b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -853,6 +853,7 @@ static void __trace_function_return(struct trace_array *tr,
 	entry->parent_ip	= trace->ret;
 	entry->rettime		= trace->rettime;
 	entry->calltime		= trace->calltime;
+	entry->overrun		= trace->overrun;
 	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
 }
 #endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9d22618bf99..2cb12fd98f6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -60,6 +60,7 @@ struct ftrace_ret_entry {
 	unsigned long		parent_ip;
 	unsigned long long	calltime;
 	unsigned long long	rettime;
+	unsigned long		overrun;
 };
 extern struct tracer boot_tracer;
 
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
index a68564af022..e00d64509c9 100644
--- a/kernel/trace/trace_functions_return.c
+++ b/kernel/trace/trace_functions_return.c
@@ -14,6 +14,19 @@
 #include "trace.h"
 
 
+#define TRACE_RETURN_PRINT_OVERRUN	0x1
+static struct tracer_opt trace_opts[] = {
+	/* Display overruns or not */
+	{ TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) },
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val = 0, /* Don't display overruns by default */
+	.opts = trace_opts
+};
+
+
 static int return_trace_init(struct trace_array *tr)
 {
 	int cpu;
@@ -42,26 +55,39 @@ print_return_function(struct trace_iterator *iter)
 		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+
 		ret = seq_print_ip_sym(s, field->ip,
 					trace_flags & TRACE_ITER_SYM_MASK);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-		ret = trace_seq_printf(s, " (%llu ns)\n",
+
+		ret = trace_seq_printf(s, " (%llu ns)",
 					field->rettime - field->calltime);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-		else
-			return TRACE_TYPE_HANDLED;
+
+		if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) {
+			ret = trace_seq_printf(s, " (Overruns: %lu)",
+						field->overrun);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		return TRACE_TYPE_HANDLED;
 	}
 	return TRACE_TYPE_UNHANDLED;
 }
 
-static struct tracer return_trace __read_mostly =
-{
+static struct tracer return_trace __read_mostly = {
 	.name	     = "return",
 	.init	     = return_trace_init,
 	.reset	     = return_trace_reset,
-	.print_line = print_return_function
+	.print_line = print_return_function,
+	.flags		= &tracer_flags,
 };
 
 static __init int init_return_trace(void)
-- 
cgit v1.2.3-70-g09d2


From 1e291b14c8f1101b9093434489bd4dc0e03f3d0f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 12 Nov 2008 18:54:42 +0000
Subject: of: Add helpers for finding device nodes which have a given property

This commit adds a routine for finding a device node which has a
certain property.  The contents of the property are not taken into
account, merely the presence or absence of the property.

Based on that routine, we add a for_each_ macro for iterating over all
nodes that have a certain property.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/of/base.c  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/of.h |  6 ++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 7c79e94a35e..4f884a358a7 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -328,6 +328,41 @@ struct device_node *of_find_compatible_node(struct device_node *from,
 }
 EXPORT_SYMBOL(of_find_compatible_node);
 
+/**
+ *	of_find_node_with_property - Find a node which has a property with
+ *                                   the given name.
+ *	@from:		The node to start searching from or NULL, the node
+ *			you pass will not be searched, only the next one
+ *			will; typically, you pass what the previous call
+ *			returned. of_node_put() will be called on it
+ *	@prop_name:	The name of the property to look for.
+ *
+ *	Returns a node pointer with refcount incremented, use
+ *	of_node_put() on it when done.
+ */
+struct device_node *of_find_node_with_property(struct device_node *from,
+	const char *prop_name)
+{
+	struct device_node *np;
+	struct property *pp;
+
+	read_lock(&devtree_lock);
+	np = from ? from->allnext : allnodes;
+	for (; np; np = np->allnext) {
+		for (pp = np->properties; pp != 0; pp = pp->next) {
+			if (of_prop_cmp(pp->name, prop_name) == 0) {
+				of_node_get(np);
+				goto out;
+			}
+		}
+	}
+out:
+	of_node_put(from);
+	read_unlock(&devtree_lock);
+	return np;
+}
+EXPORT_SYMBOL(of_find_node_with_property);
+
 /**
  * of_match_node - Tell if an device_node has a matching of_match structure
  *	@matches:	array of of device match structures to search in
diff --git a/include/linux/of.h b/include/linux/of.h
index e2488f5e7cb..6a7efa242f5 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -57,6 +57,12 @@ extern struct device_node *of_get_next_child(const struct device_node *node,
 	for (child = of_get_next_child(parent, NULL); child != NULL; \
 	     child = of_get_next_child(parent, child))
 
+extern struct device_node *of_find_node_with_property(
+	struct device_node *from, const char *prop_name);
+#define for_each_node_with_property(dn, prop_name) \
+	for (dn = of_find_node_with_property(NULL, prop_name); dn; \
+	     dn = of_find_node_with_property(dn, prop_name))
+
 extern struct property *of_find_property(const struct device_node *np,
 					 const char *name,
 					 int *lenp);
-- 
cgit v1.2.3-70-g09d2


From 6fae35f9cea92793a98b2d9ab21235e5ae035581 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 17 Nov 2008 15:53:42 +0000
Subject: uwb: add basic radio manager

The UWB radio manager coordinates the use of the radio between the
PALs that may be using it.  PALs request use of the radio with
uwb_radio_start() and the radio manager will start beaconing if its
not already doing so.  When the last PAL has called uwb_radio_stop()
beaconing will be stopped.

In the future, the radio manager will have a more sophisticated channel
selection algorithm, probably following the Channel Selection Policy
from the WiMedia Alliance when it is finalized.  For now, channel 9
(BG1, TFC1) is selected.

The user may override the channel selected by the radio manager and may
force the radio to stop beaconing.

The WUSB Host Controller PAL makes use of this and there are two new
debug PAL commands that can be used for testing.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 Documentation/ABI/testing/sysfs-class-uwb_rc |  14 +-
 Documentation/usb/wusb-cbaf                  |   9 --
 drivers/usb/host/hwa-hc.c                    |   1 -
 drivers/usb/host/whci/hcd.c                  |   2 -
 drivers/usb/wusbcore/devconnect.c            |   5 +-
 drivers/usb/wusbcore/mmc.c                   |  75 ++--------
 drivers/usb/wusbcore/pal.c                   |  16 ++-
 drivers/usb/wusbcore/wusbhc.h                |   8 +-
 drivers/uwb/Makefile                         |   1 +
 drivers/uwb/beacon.c                         |  26 ++--
 drivers/uwb/drp.c                            |  24 +---
 drivers/uwb/lc-rc.c                          |  11 +-
 drivers/uwb/pal.c                            |  20 +--
 drivers/uwb/radio.c                          | 202 +++++++++++++++++++++++++++
 drivers/uwb/reset.c                          |   6 +-
 drivers/uwb/rsv.c                            |   4 +-
 drivers/uwb/uwb-debug.c                      |  26 +++-
 drivers/uwb/uwb-internal.h                   |   5 +
 drivers/uwb/wlp/wlp-lc.c                     |   5 +-
 include/linux/uwb.h                          |  23 ++-
 include/linux/uwb/debug-cmd.h                |   2 +
 21 files changed, 323 insertions(+), 162 deletions(-)
 create mode 100644 drivers/uwb/radio.c

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc
index a0d18dbeb7a..6a5fd072849 100644
--- a/Documentation/ABI/testing/sysfs-class-uwb_rc
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc
@@ -32,14 +32,16 @@ Contact:        linux-usb@vger.kernel.org
 Description:
                 Write:
 
-                <channel> [<bpst offset>]
+                <channel>
 
-                to start beaconing on a specific channel, or stop
-                beaconing if <channel> is -1.  Valid channels depends
-                on the radio controller's supported band groups.
+                to force a specific channel to be used when beaconing,
+                or, if <channel> is -1, to prohibit beaconing.  If
+                <channel> is 0, then the default channel selection
+                algorithm will be used.  Valid channels depends on the
+                radio controller's supported band groups.
 
-                <bpst offset> may be used to try and join a specific
-                beacon group if more than one was found during a scan.
+                Reading returns the currently active channel, or -1 if
+                the radio controller is not beaconing.
 
 What:           /sys/class/uwb_rc/uwbN/scan
 Date:           July 2008
diff --git a/Documentation/usb/wusb-cbaf b/Documentation/usb/wusb-cbaf
index 2e78b70f3ad..426ddaaef96 100644
--- a/Documentation/usb/wusb-cbaf
+++ b/Documentation/usb/wusb-cbaf
@@ -80,12 +80,6 @@ case $1 in
     start)
         for dev in ${2:-$hdevs}
           do
-          uwb_rc=$(readlink -f $dev/uwb_rc)
-          if cat $uwb_rc/beacon | grep -q -- "-1"
-              then
-              echo 13 0 > $uwb_rc/beacon
-              echo I: started beaconing on ch 13 on $(basename $uwb_rc) >&2
-          fi
           echo $host_CHID > $dev/wusb_chid
           echo I: started host $(basename $dev) >&2
         done
@@ -95,9 +89,6 @@ case $1 in
           do
           echo 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 > $dev/wusb_chid
           echo I: stopped host $(basename $dev) >&2
-          uwb_rc=$(readlink -f $dev/uwb_rc)
-          echo -1 | cat > $uwb_rc/beacon
-          echo I: stopped beaconing on $(basename $uwb_rc) >&2
         done
         ;;
     set-chid)
diff --git a/drivers/usb/host/hwa-hc.c b/drivers/usb/host/hwa-hc.c
index 2827353e97e..2a4d36fa70b 100644
--- a/drivers/usb/host/hwa-hc.c
+++ b/drivers/usb/host/hwa-hc.c
@@ -221,7 +221,6 @@ static void hwahc_op_stop(struct usb_hcd *usb_hcd)
 
 	d_fnstart(4, dev, "(hwahc %p)\n", hwahc);
 	mutex_lock(&wusbhc->mutex);
-	wusbhc_stop(wusbhc);
 	wusb_cluster_id_put(wusbhc->cluster_id);
 	mutex_unlock(&wusbhc->mutex);
 	d_fnend(4, dev, "(hwahc %p) = %d\n", hwahc, result);
diff --git a/drivers/usb/host/whci/hcd.c b/drivers/usb/host/whci/hcd.c
index de1e07271b8..f599f89d3be 100644
--- a/drivers/usb/host/whci/hcd.c
+++ b/drivers/usb/host/whci/hcd.c
@@ -91,8 +91,6 @@ static void whc_stop(struct usb_hcd *usb_hcd)
 
 	mutex_lock(&wusbhc->mutex);
 
-	wusbhc_stop(wusbhc);
-
 	/* stop HC */
 	le_writel(0, whc->base + WUSBINTR);
 	whc_write_wusbcmd(whc, WUSBCMD_RUN, 0);
diff --git a/drivers/usb/wusbcore/devconnect.c b/drivers/usb/wusbcore/devconnect.c
index c01c7a80744..08a1ec90386 100644
--- a/drivers/usb/wusbcore/devconnect.c
+++ b/drivers/usb/wusbcore/devconnect.c
@@ -1124,8 +1124,7 @@ void wusbhc_devconnect_destroy(struct wusbhc *wusbhc)
  * FIXME: This also enables the keep alives but this is not necessary
  * until there are connected and authenticated devices.
  */
-int wusbhc_devconnect_start(struct wusbhc *wusbhc,
-			    const struct wusb_ckhdid *chid)
+int wusbhc_devconnect_start(struct wusbhc *wusbhc)
 {
 	struct device *dev = wusbhc->dev;
 	struct wuie_host_info *hi;
@@ -1138,7 +1137,7 @@ int wusbhc_devconnect_start(struct wusbhc *wusbhc,
 	hi->hdr.bLength       = sizeof(*hi);
 	hi->hdr.bIEIdentifier = WUIE_ID_HOST_INFO;
 	hi->attributes        = cpu_to_le16((wusbhc->rsv->stream << 3) | WUIE_HI_CAP_ALL);
-	hi->CHID              = *chid;
+	hi->CHID              = wusbhc->chid;
 	result = wusbhc_mmcie_set(wusbhc, 0, 0, &hi->hdr);
 	if (result < 0) {
 		dev_err(dev, "Cannot add Host Info MMCIE: %d\n", result);
diff --git a/drivers/usb/wusbcore/mmc.c b/drivers/usb/wusbcore/mmc.c
index af2aee0fdff..5463ecebafd 100644
--- a/drivers/usb/wusbcore/mmc.c
+++ b/drivers/usb/wusbcore/mmc.c
@@ -162,12 +162,11 @@ EXPORT_SYMBOL_GPL(wusbhc_mmcie_rm);
 /*
  * wusbhc_start - start transmitting MMCs and accepting connections
  * @wusbhc: the HC to start
- * @chid: the CHID to use for this host
  *
  * Establishes a cluster reservation, enables device connections, and
  * starts MMCs with appropriate DNTS parameters.
  */
-int wusbhc_start(struct wusbhc *wusbhc, const struct wusb_ckhdid *chid)
+int wusbhc_start(struct wusbhc *wusbhc)
 {
 	int result;
 	struct device *dev = wusbhc->dev;
@@ -181,7 +180,7 @@ int wusbhc_start(struct wusbhc *wusbhc, const struct wusb_ckhdid *chid)
 		goto error_rsv_establish;
 	}
 
-	result = wusbhc_devconnect_start(wusbhc, chid);
+	result = wusbhc_devconnect_start(wusbhc);
 	if (result < 0) {
 		dev_err(dev, "error enabling device connections: %d\n", result);
 		goto error_devconnect_start;
@@ -218,34 +217,6 @@ error_rsv_establish:
 	return result;
 }
 
-/*
- * Disconnect all from the WUSB Channel
- *
- * Send a Host Disconnect IE in the MMC, wait, don't send it any more
- */
-static int __wusbhc_host_disconnect_ie(struct wusbhc *wusbhc)
-{
-	int result = -ENOMEM;
-	struct wuie_host_disconnect *host_disconnect_ie;
-	might_sleep();
-	host_disconnect_ie = kmalloc(sizeof(*host_disconnect_ie), GFP_KERNEL);
-	if (host_disconnect_ie == NULL)
-		goto error_alloc;
-	host_disconnect_ie->hdr.bLength       = sizeof(*host_disconnect_ie);
-	host_disconnect_ie->hdr.bIEIdentifier = WUIE_ID_HOST_DISCONNECT;
-	result = wusbhc_mmcie_set(wusbhc, 0, 0, &host_disconnect_ie->hdr);
-	if (result < 0)
-		goto error_mmcie_set;
-
-	/* WUSB1.0[8.5.3.1 & 7.5.2] */
-	msleep(100);
-	wusbhc_mmcie_rm(wusbhc, &host_disconnect_ie->hdr);
-error_mmcie_set:
-	kfree(host_disconnect_ie);
-error_alloc:
-	return result;
-}
-
 /*
  * wusbhc_stop - stop transmitting MMCs
  * @wusbhc: the HC to stop
@@ -264,29 +235,6 @@ void wusbhc_stop(struct wusbhc *wusbhc)
 }
 EXPORT_SYMBOL_GPL(wusbhc_stop);
 
-/*
- * Change the CHID in a WUSB Channel
- *
- * If it is just a new CHID, send a Host Disconnect IE and then change
- * the CHID IE.
- */
-static int __wusbhc_chid_change(struct wusbhc *wusbhc,
-				const struct wusb_ckhdid *chid)
-{
-	int result = -ENOSYS;
-	struct device *dev = wusbhc->dev;
-	dev_err(dev, "%s() not implemented yet\n", __func__);
-	return result;
-
-	BUG_ON(wusbhc->wuie_host_info == NULL);
-	__wusbhc_host_disconnect_ie(wusbhc);
-	wusbhc->wuie_host_info->CHID = *chid;
-	result = wusbhc_mmcie_set(wusbhc, 0, 0, &wusbhc->wuie_host_info->hdr);
-	if (result < 0)
-		dev_err(dev, "Can't update Host Info WUSB IE: %d\n", result);
-	return result;
-}
-
 /*
  * Set/reset/update a new CHID
  *
@@ -302,16 +250,19 @@ int wusbhc_chid_set(struct wusbhc *wusbhc, const struct wusb_ckhdid *chid)
 		chid = NULL;
 
 	mutex_lock(&wusbhc->mutex);
-	if (wusbhc->active) {
-		if (chid)
-			result = __wusbhc_chid_change(wusbhc, chid);
-		else
-			wusbhc_stop(wusbhc);
-	} else {
-		if (chid)
-			wusbhc_start(wusbhc, chid);
+	if (chid) {
+		if (wusbhc->active) {
+			mutex_unlock(&wusbhc->mutex);
+			return -EBUSY;
+		}
+		wusbhc->chid = *chid;
 	}
 	mutex_unlock(&wusbhc->mutex);
+
+	if (chid)
+		result = uwb_radio_start(&wusbhc->pal);
+	else
+		uwb_radio_stop(&wusbhc->pal);
 	return result;
 }
 EXPORT_SYMBOL_GPL(wusbhc_chid_set);
diff --git a/drivers/usb/wusbcore/pal.c b/drivers/usb/wusbcore/pal.c
index 7cc51e9905c..d0b172c5ecc 100644
--- a/drivers/usb/wusbcore/pal.c
+++ b/drivers/usb/wusbcore/pal.c
@@ -18,6 +18,16 @@
  */
 #include "wusbhc.h"
 
+static void wusbhc_channel_changed(struct uwb_pal *pal, int channel)
+{
+	struct wusbhc *wusbhc = container_of(pal, struct wusbhc, pal);
+
+	if (channel < 0)
+		wusbhc_stop(wusbhc);
+	else
+		wusbhc_start(wusbhc);
+}
+
 /**
  * wusbhc_pal_register - register the WUSB HC as a UWB PAL
  * @wusbhc: the WUSB HC
@@ -28,8 +38,10 @@ int wusbhc_pal_register(struct wusbhc *wusbhc)
 
 	wusbhc->pal.name   = "wusbhc";
 	wusbhc->pal.device = wusbhc->usb_hcd.self.controller;
+	wusbhc->pal.rc     = wusbhc->uwb_rc;
+	wusbhc->pal.channel_changed = wusbhc_channel_changed;
 
-	return uwb_pal_register(wusbhc->uwb_rc, &wusbhc->pal);
+	return uwb_pal_register(&wusbhc->pal);
 }
 
 /**
@@ -38,5 +50,5 @@ int wusbhc_pal_register(struct wusbhc *wusbhc)
  */
 void wusbhc_pal_unregister(struct wusbhc *wusbhc)
 {
-	uwb_pal_unregister(wusbhc->uwb_rc, &wusbhc->pal);
+	uwb_pal_unregister(&wusbhc->pal);
 }
diff --git a/drivers/usb/wusbcore/wusbhc.h b/drivers/usb/wusbcore/wusbhc.h
index 8fef934ad2f..797c2453a35 100644
--- a/drivers/usb/wusbcore/wusbhc.h
+++ b/drivers/usb/wusbcore/wusbhc.h
@@ -252,7 +252,8 @@ struct wusbhc {
 	struct uwb_pal pal;
 
 	unsigned trust_timeout;			/* in jiffies */
-	struct wuie_host_info *wuie_host_info;	/* Includes CHID */
+	struct wusb_ckhdid chid;
+	struct wuie_host_info *wuie_host_info;
 
 	struct mutex mutex;			/* locks everything else */
 	u16 cluster_id;				/* Wireless USB Cluster ID */
@@ -376,15 +377,14 @@ static inline void wusbhc_put(struct wusbhc *wusbhc)
 	usb_put_hcd(&wusbhc->usb_hcd);
 }
 
-int wusbhc_start(struct wusbhc *wusbhc, const struct wusb_ckhdid *chid);
+int wusbhc_start(struct wusbhc *wusbhc);
 void wusbhc_stop(struct wusbhc *wusbhc);
 extern int wusbhc_chid_set(struct wusbhc *, const struct wusb_ckhdid *);
 
 /* Device connect handling */
 extern int wusbhc_devconnect_create(struct wusbhc *);
 extern void wusbhc_devconnect_destroy(struct wusbhc *);
-extern int wusbhc_devconnect_start(struct wusbhc *wusbhc,
-				   const struct wusb_ckhdid *chid);
+extern int wusbhc_devconnect_start(struct wusbhc *wusbhc);
 extern void wusbhc_devconnect_stop(struct wusbhc *wusbhc);
 extern void wusbhc_handle_dn(struct wusbhc *, u8 srcaddr,
 			     struct wusb_dn_hdr *dn_hdr, size_t size);
diff --git a/drivers/uwb/Makefile b/drivers/uwb/Makefile
index 2b99c3e6167..ce21a95da04 100644
--- a/drivers/uwb/Makefile
+++ b/drivers/uwb/Makefile
@@ -18,6 +18,7 @@ uwb-objs :=		\
 	lc-rc.o		\
 	neh.o		\
 	pal.o		\
+	radio.o		\
 	reset.o		\
 	rsv.o		\
 	scan.o		\
diff --git a/drivers/uwb/beacon.c b/drivers/uwb/beacon.c
index d9f2a8acc59..247956098af 100644
--- a/drivers/uwb/beacon.c
+++ b/drivers/uwb/beacon.c
@@ -119,7 +119,6 @@ int uwb_rc_beacon(struct uwb_rc *rc, int channel, unsigned bpst_offset)
 	int result;
 	struct device *dev = &rc->uwb_dev.dev;
 
-	mutex_lock(&rc->uwb_dev.mutex);
 	if (channel < 0)
 		channel = -1;
 	if (channel == -1)
@@ -128,7 +127,7 @@ int uwb_rc_beacon(struct uwb_rc *rc, int channel, unsigned bpst_offset)
 		/* channel >= 0...dah */
 		result = uwb_rc_start_beacon(rc, bpst_offset, channel);
 		if (result < 0)
-			goto out_up;
+			return result;
 		if (le16_to_cpu(rc->ies->wIELength) > 0) {
 			result = uwb_rc_set_ie(rc, rc->ies);
 			if (result < 0) {
@@ -137,19 +136,14 @@ int uwb_rc_beacon(struct uwb_rc *rc, int channel, unsigned bpst_offset)
 				result = uwb_rc_stop_beacon(rc);
 				channel = -1;
 				bpst_offset = 0;
-			} else
-				result = 0;
+			}
 		}
 	}
 
-	if (result < 0)
-		goto out_up;
-	rc->beaconing = channel;
-
-	uwb_notify(rc, NULL, uwb_bg_joined(rc) ? UWB_NOTIF_BG_JOIN : UWB_NOTIF_BG_LEAVE);
-
-out_up:
-	mutex_unlock(&rc->uwb_dev.mutex);
+	if (result >= 0) {
+		rc->beaconing = channel;
+		uwb_notify(rc, NULL, uwb_bg_joined(rc) ? UWB_NOTIF_BG_JOIN : UWB_NOTIF_BG_LEAVE);
+	}
 	return result;
 }
 
@@ -618,9 +612,6 @@ static ssize_t uwb_rc_beacon_show(struct device *dev,
 
 /*
  * Start beaconing on the specified channel, or stop beaconing.
- *
- * The BPST offset of when to start searching for a beacon group to
- * join may be specified.
  */
 static ssize_t uwb_rc_beacon_store(struct device *dev,
 				   struct device_attribute *attr,
@@ -629,12 +620,11 @@ static ssize_t uwb_rc_beacon_store(struct device *dev,
 	struct uwb_dev *uwb_dev = to_uwb_dev(dev);
 	struct uwb_rc *rc = uwb_dev->rc;
 	int channel;
-	unsigned bpst_offset = 0;
 	ssize_t result = -EINVAL;
 
-	result = sscanf(buf, "%d %u\n", &channel, &bpst_offset);
+	result = sscanf(buf, "%d", &channel);
 	if (result >= 1)
-		result = uwb_rc_beacon(rc, channel, bpst_offset);
+		result = uwb_radio_force_channel(rc, channel);
 
 	return result < 0 ? result : size;
 }
diff --git a/drivers/uwb/drp.c b/drivers/uwb/drp.c
index c0b1e5e2bd0..fe328146adb 100644
--- a/drivers/uwb/drp.c
+++ b/drivers/uwb/drp.c
@@ -37,14 +37,13 @@
  *
  * A DRP Availability IE is appended.
  *
- * rc->uwb_dev.mutex is held
+ * rc->rsvs_mutex is held
  *
  * FIXME We currently ignore the returned value indicating the remaining space
  * in beacon. This could be used to deny reservation requests earlier if
  * determined that they would cause the beacon space to be exceeded.
  */
-static
-int uwb_rc_gen_send_drp_ie(struct uwb_rc *rc)
+int uwb_rc_send_all_drp_ie(struct uwb_rc *rc)
 {
 	int result;
 	struct device *dev = &rc->uwb_dev.dev;
@@ -102,25 +101,6 @@ error_cmd:
 	kfree(cmd);
 error:
 	return result;
-
-}
-/**
- * Send all DRP IEs associated with this host
- *
- * @returns:    >= 0 number of bytes still available in the beacon
- *              < 0 errno code on error.
- *
- * As per the protocol we obtain the host controller device lock to access
- * bandwidth structures.
- */
-int uwb_rc_send_all_drp_ie(struct uwb_rc *rc)
-{
-	int result;
-
-	mutex_lock(&rc->uwb_dev.mutex);
-	result = uwb_rc_gen_send_drp_ie(rc);
-	mutex_unlock(&rc->uwb_dev.mutex);
-	return result;
 }
 
 void uwb_drp_handle_timeout(struct uwb_rsv *rsv)
diff --git a/drivers/uwb/lc-rc.c b/drivers/uwb/lc-rc.c
index f00633d334d..9cf21e6bb62 100644
--- a/drivers/uwb/lc-rc.c
+++ b/drivers/uwb/lc-rc.c
@@ -189,9 +189,9 @@ static int uwb_rc_setup(struct uwb_rc *rc)
 	int result;
 	struct device *dev = &rc->uwb_dev.dev;
 
-	result = uwb_rc_reset(rc);
+	result = uwb_radio_setup(rc);
 	if (result < 0) {
-		dev_err(dev, "cannot reset UWB radio: %d\n", result);
+		dev_err(dev, "cannot setup UWB radio: %d\n", result);
 		goto error;
 	}
 	result = uwb_rc_mac_addr_setup(rc);
@@ -311,12 +311,7 @@ void uwb_rc_rm(struct uwb_rc *rc)
 
 	uwb_dbg_del_rc(rc);
 	uwb_rsv_remove_all(rc);
-	uwb_rc_ie_rm(rc, UWB_IDENTIFICATION_IE);
-	if (rc->beaconing >= 0)
-		uwb_rc_beacon(rc, -1, 0);
-	if (rc->scan_type != UWB_SCAN_DISABLED)
-		uwb_rc_scan(rc, rc->scanning, UWB_SCAN_DISABLED, 0);
-	uwb_rc_reset(rc);
+	uwb_radio_shutdown(rc);
 
 	rc->stop(rc);
 
diff --git a/drivers/uwb/pal.c b/drivers/uwb/pal.c
index 1afb38eacb9..605765124f5 100644
--- a/drivers/uwb/pal.c
+++ b/drivers/uwb/pal.c
@@ -32,13 +32,13 @@ EXPORT_SYMBOL_GPL(uwb_pal_init);
 
 /**
  * uwb_pal_register - register a UWB PAL
- * @rc: the radio controller the PAL will be using
  * @pal: the PAL
  *
  * The PAL must be initialized with uwb_pal_init().
  */
-int uwb_pal_register(struct uwb_rc *rc, struct uwb_pal *pal)
+int uwb_pal_register(struct uwb_pal *pal)
 {
+	struct uwb_rc *rc = pal->rc;
 	int ret;
 
 	if (pal->device) {
@@ -54,9 +54,9 @@ int uwb_pal_register(struct uwb_rc *rc, struct uwb_pal *pal)
 		}
 	}
 
-	spin_lock(&rc->pal_lock);
+	mutex_lock(&rc->uwb_dev.mutex);
 	list_add(&pal->node, &rc->pals);
-	spin_unlock(&rc->pal_lock);
+	mutex_unlock(&rc->uwb_dev.mutex);
 
 	return 0;
 }
@@ -64,14 +64,17 @@ EXPORT_SYMBOL_GPL(uwb_pal_register);
 
 /**
  * uwb_pal_register - unregister a UWB PAL
- * @rc: the radio controller the PAL was using
  * @pal: the PAL
  */
-void uwb_pal_unregister(struct uwb_rc *rc, struct uwb_pal *pal)
+void uwb_pal_unregister(struct uwb_pal *pal)
 {
-	spin_lock(&rc->pal_lock);
+	struct uwb_rc *rc = pal->rc;
+
+	uwb_radio_stop(pal);
+
+	mutex_lock(&rc->uwb_dev.mutex);
 	list_del(&pal->node);
-	spin_unlock(&rc->pal_lock);
+	mutex_unlock(&rc->uwb_dev.mutex);
 
 	if (pal->device) {
 		sysfs_remove_link(&rc->uwb_dev.dev.kobj, pal->name);
@@ -86,6 +89,5 @@ EXPORT_SYMBOL_GPL(uwb_pal_unregister);
  */
 void uwb_rc_pal_init(struct uwb_rc *rc)
 {
-	spin_lock_init(&rc->pal_lock);
 	INIT_LIST_HEAD(&rc->pals);
 }
diff --git a/drivers/uwb/radio.c b/drivers/uwb/radio.c
new file mode 100644
index 00000000000..f0d55495f5e
--- /dev/null
+++ b/drivers/uwb/radio.c
@@ -0,0 +1,202 @@
+/*
+ * UWB radio (channel) management.
+ *
+ * Copyright (C) 2008 Cambridge Silicon Radio Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/kernel.h>
+#include <linux/uwb.h>
+
+#include "uwb-internal.h"
+
+
+static int uwb_radio_select_channel(struct uwb_rc *rc)
+{
+	/*
+	 * Default to channel 9 (BG1, TFC1) unless the user has
+	 * selected a specific channel or there are no active PALs.
+	 */
+	if (rc->active_pals == 0)
+		return -1;
+	if (rc->beaconing_forced)
+		return rc->beaconing_forced;
+	return 9;
+}
+
+
+/*
+ * Notify all active PALs that the channel has changed.
+ */
+static void uwb_radio_channel_changed(struct uwb_rc *rc, int channel)
+{
+	struct uwb_pal *pal;
+
+	list_for_each_entry(pal, &rc->pals, node) {
+		if (pal->channel && channel != pal->channel) {
+			pal->channel = channel;
+			if (pal->channel_changed)
+				pal->channel_changed(pal, pal->channel);
+		}
+	}
+}
+
+/*
+ * Change to a new channel and notify any active PALs of the new
+ * channel.
+ *
+ * When stopping the radio, PALs need to be notified first so they can
+ * terminate any active reservations.
+ */
+static int uwb_radio_change_channel(struct uwb_rc *rc, int channel)
+{
+	int ret = 0;
+
+	if (channel == -1)
+		uwb_radio_channel_changed(rc, channel);
+
+	if (channel != rc->beaconing) {
+		if (rc->beaconing != -1 && channel != -1) {
+			/*
+			 * FIXME: should signal the channel change
+			 * with a Channel Change IE.
+			 */
+			ret = uwb_radio_change_channel(rc, -1);
+			if (ret < 0)
+				return ret;
+		}
+		ret = uwb_rc_beacon(rc, channel, 0);
+	}
+
+	if (channel != -1)
+		uwb_radio_channel_changed(rc, rc->beaconing);
+
+	return ret;
+}
+
+/**
+ * uwb_radio_start - request that the radio be started
+ * @pal: the PAL making the request.
+ *
+ * If the radio is not already active, aa suitable channel is selected
+ * and beacons are started.
+ */
+int uwb_radio_start(struct uwb_pal *pal)
+{
+	struct uwb_rc *rc = pal->rc;
+	int ret = 0;
+
+	mutex_lock(&rc->uwb_dev.mutex);
+
+	if (!pal->channel) {
+		pal->channel = -1;
+		rc->active_pals++;
+		ret = uwb_radio_change_channel(rc, uwb_radio_select_channel(rc));
+	}
+
+	mutex_unlock(&rc->uwb_dev.mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(uwb_radio_start);
+
+/**
+ * uwb_radio_stop - request tha the radio be stopped.
+ * @pal: the PAL making the request.
+ *
+ * Stops the radio if no other PAL is making use of it.
+ */
+void uwb_radio_stop(struct uwb_pal *pal)
+{
+	struct uwb_rc *rc = pal->rc;
+
+	mutex_lock(&rc->uwb_dev.mutex);
+
+	if (pal->channel) {
+		rc->active_pals--;
+		uwb_radio_change_channel(rc, uwb_radio_select_channel(rc));
+		pal->channel = 0;
+	}
+
+	mutex_unlock(&rc->uwb_dev.mutex);
+}
+EXPORT_SYMBOL_GPL(uwb_radio_stop);
+
+/*
+ * uwb_radio_force_channel - force a specific channel to be used
+ * @rc: the radio controller.
+ * @channel: the channel to use; -1 to force the radio to stop; 0 to
+ *   use the default channel selection algorithm.
+ */
+int uwb_radio_force_channel(struct uwb_rc *rc, int channel)
+{
+	int ret = 0;
+
+	mutex_lock(&rc->uwb_dev.mutex);
+
+	rc->beaconing_forced = channel;
+	ret = uwb_radio_change_channel(rc, uwb_radio_select_channel(rc));
+
+	mutex_unlock(&rc->uwb_dev.mutex);
+	return ret;
+}
+
+/*
+ * uwb_radio_setup - setup the radio manager
+ * @rc: the radio controller.
+ *
+ * The radio controller is reset to ensure it's in a known state
+ * before it's used.
+ */
+int uwb_radio_setup(struct uwb_rc *rc)
+{
+	return uwb_rc_reset(rc);
+}
+
+/*
+ * uwb_radio_reset_state - reset any radio manager state
+ * @rc: the radio controller.
+ *
+ * All internal radio manager state is reset to values corresponding
+ * to a reset radio controller.
+ */
+void uwb_radio_reset_state(struct uwb_rc *rc)
+{
+	struct uwb_pal *pal;
+
+	mutex_lock(&rc->uwb_dev.mutex);
+
+	list_for_each_entry(pal, &rc->pals, node) {
+		if (pal->channel) {
+			pal->channel = -1;
+			if (pal->channel_changed)
+				pal->channel_changed(pal, -1);
+		}
+	}
+
+	rc->beaconing = -1;
+	rc->scanning = -1;
+
+	mutex_unlock(&rc->uwb_dev.mutex);
+}
+
+/*
+ * uwb_radio_shutdown - shutdown the radio manager
+ * @rc: the radio controller.
+ *
+ * The radio controller is reset.
+ */
+void uwb_radio_shutdown(struct uwb_rc *rc)
+{
+	uwb_radio_reset_state(rc);
+	uwb_rc_reset(rc);
+}
diff --git a/drivers/uwb/reset.c b/drivers/uwb/reset.c
index e39b32099af..ce8283cc809 100644
--- a/drivers/uwb/reset.c
+++ b/drivers/uwb/reset.c
@@ -365,11 +365,7 @@ void uwb_rc_pre_reset(struct uwb_rc *rc)
 	rc->stop(rc);
 	uwbd_flush(rc);
 
-	mutex_lock(&rc->uwb_dev.mutex);
-	rc->beaconing = -1;
-	rc->scanning = -1;
-	mutex_unlock(&rc->uwb_dev.mutex);
-
+	uwb_radio_reset_state(rc);
 	uwb_rsv_remove_all(rc);
 }
 EXPORT_SYMBOL_GPL(uwb_rc_pre_reset);
diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index 935d5b536db..1cd84f92754 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -555,14 +555,14 @@ static struct uwb_rsv *uwb_rsv_new_target(struct uwb_rc *rc,
 	 * deny the request.
 	 */
 	rsv->state = UWB_RSV_STATE_T_DENIED;
-	spin_lock(&rc->pal_lock);
+	mutex_lock(&rc->uwb_dev.mutex);
 	list_for_each_entry(pal, &rc->pals, node) {
 		if (pal->new_rsv)
 			pal->new_rsv(pal, rsv);
 		if (rsv->state == UWB_RSV_STATE_T_ACCEPTED)
 			break;
 	}
-	spin_unlock(&rc->pal_lock);
+	mutex_unlock(&rc->uwb_dev.mutex);
 
 	list_add_tail(&rsv->rc_node, &rc->reservations);
 	state = rsv->state;
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 217ebaac128..0e58071a232 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -192,7 +192,7 @@ static ssize_t command_write(struct file *file, const char __user *buf,
 {
 	struct uwb_rc *rc = file->private_data;
 	struct uwb_dbg_cmd cmd;
-	int ret;
+	int ret = 0;
 
 	if (len != sizeof(struct uwb_dbg_cmd))
 		return -EINVAL;
@@ -213,6 +213,12 @@ static ssize_t command_write(struct file *file, const char __user *buf,
 	case UWB_DBG_CMD_IE_RM:
 		ret = cmd_ie_rm(rc, &cmd.ie_rm);
 		break;
+	case UWB_DBG_CMD_RADIO_START:
+		ret = uwb_radio_start(&rc->dbg->pal);
+		break;
+	case UWB_DBG_CMD_RADIO_STOP:
+		uwb_radio_stop(&rc->dbg->pal);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -306,6 +312,17 @@ static struct file_operations drp_avail_fops = {
 	.owner   = THIS_MODULE,
 };
 
+static void uwb_dbg_channel_changed(struct uwb_pal *pal, int channel)
+{
+	struct uwb_dbg *dbg = container_of(pal, struct uwb_dbg, pal);
+	struct device *dev = &pal->rc->uwb_dev.dev;
+
+	if (channel > 0)
+		dev_info(dev, "debug: channel %d started\n", channel);
+	else
+		dev_info(dev, "debug: channel stopped\n");
+}
+
 static void uwb_dbg_new_rsv(struct uwb_pal *pal, struct uwb_rsv *rsv)
 {
 	struct uwb_dbg *dbg = container_of(pal, struct uwb_dbg, pal);
@@ -329,8 +346,11 @@ void uwb_dbg_add_rc(struct uwb_rc *rc)
 	INIT_LIST_HEAD(&rc->dbg->rsvs);
 
 	uwb_pal_init(&rc->dbg->pal);
+	rc->dbg->pal.rc = rc;
+	rc->dbg->pal.channel_changed = uwb_dbg_channel_changed;
 	rc->dbg->pal.new_rsv = uwb_dbg_new_rsv;
-	uwb_pal_register(rc, &rc->dbg->pal);
+	uwb_pal_register(&rc->dbg->pal);
+
 	if (root_dir) {
 		rc->dbg->root_d = debugfs_create_dir(dev_name(&rc->uwb_dev.dev),
 						     root_dir);
@@ -364,7 +384,7 @@ void uwb_dbg_del_rc(struct uwb_rc *rc)
 		uwb_rsv_terminate(rsv);
 	}
 
-	uwb_pal_unregister(rc, &rc->dbg->pal);
+	uwb_pal_unregister(&rc->dbg->pal);
 
 	if (root_dir) {
 		debugfs_remove(rc->dbg->drp_avail_f);
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index af95541dabc..9c0cdb4ded0 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -238,6 +238,11 @@ struct uwb_dev *uwb_dev_get_by_devaddr(struct uwb_rc *rc,
 struct uwb_dev *uwb_dev_get_by_macaddr(struct uwb_rc *rc,
 				       const struct uwb_mac_addr *macaddr);
 
+int uwb_radio_setup(struct uwb_rc *rc);
+void uwb_radio_reset_state(struct uwb_rc *rc);
+void uwb_radio_shutdown(struct uwb_rc *rc);
+int uwb_radio_force_channel(struct uwb_rc *rc, int channel);
+
 /* -- UWB Sysfs representation */
 extern struct class uwb_rc_class;
 extern struct device_attribute dev_attr_mac_address;
diff --git a/drivers/uwb/wlp/wlp-lc.c b/drivers/uwb/wlp/wlp-lc.c
index 0799402e73f..7e5eb49b03b 100644
--- a/drivers/uwb/wlp/wlp-lc.c
+++ b/drivers/uwb/wlp/wlp-lc.c
@@ -543,7 +543,8 @@ int wlp_setup(struct wlp *wlp, struct uwb_rc *rc)
 	uwb_notifs_register(rc, &wlp->uwb_notifs_handler);
 
 	uwb_pal_init(&wlp->pal);
-	result = uwb_pal_register(rc, &wlp->pal);
+	wlp->pal.rc = rc;
+	result = uwb_pal_register(&wlp->pal);
 	if (result < 0)
 		uwb_notifs_deregister(wlp->rc, &wlp->uwb_notifs_handler);
 
@@ -557,7 +558,7 @@ void wlp_remove(struct wlp *wlp)
 	struct device *dev = &wlp->rc->uwb_dev.dev;
 	d_fnstart(6, dev, "wlp %p\n", wlp);
 	wlp_neighbors_release(wlp);
-	uwb_pal_unregister(wlp->rc, &wlp->pal);
+	uwb_pal_unregister(&wlp->pal);
 	uwb_notifs_deregister(wlp->rc, &wlp->uwb_notifs_handler);
 	wlp_eda_release(&wlp->eda);
 	mutex_lock(&wlp->mutex);
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index effd97998fd..7d3ebf046f9 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -355,6 +355,7 @@ struct uwb_rc {
 	u8 ctx_roll;
 
 	int beaconing;			/* Beaconing state [channel number] */
+	int beaconing_forced;
 	int scanning;
 	enum uwb_scan_type scan_type:3;
 	unsigned ready:1;
@@ -373,8 +374,8 @@ struct uwb_rc {
 	struct uwb_rc_cmd_set_ie *ies;
 	size_t ies_capacity;
 
-	spinlock_t pal_lock;
 	struct list_head pals;
+	int active_pals;
 
 	struct uwb_dbg *dbg;
 };
@@ -382,11 +383,17 @@ struct uwb_rc {
 
 /**
  * struct uwb_pal - a UWB PAL
- * @name:    descriptive name for this PAL (wushc, wlp, etc.).
+ * @name:    descriptive name for this PAL (wusbhc, wlp, etc.).
  * @device:  a device for the PAL.  Used to link the PAL and the radio
  *           controller in sysfs.
+ * @rc:      the radio controller the PAL uses.
+ * @channel_changed: called when the channel used by the radio changes.
+ *           A channel of -1 means the channel has been stopped.
  * @new_rsv: called when a peer requests a reservation (may be NULL if
  *           the PAL cannot accept reservation requests).
+ * @channel: channel being used by the PAL; 0 if the PAL isn't using
+ *           the radio; -1 if the PAL wishes to use the radio but
+ *           cannot.
  *
  * A Protocol Adaptation Layer (PAL) is a user of the WiMedia UWB
  * radio platform (e.g., WUSB, WLP or Bluetooth UWB AMP).
@@ -405,12 +412,20 @@ struct uwb_pal {
 	struct list_head node;
 	const char *name;
 	struct device *device;
+	struct uwb_rc *rc;
+
+	void (*channel_changed)(struct uwb_pal *pal, int channel);
 	void (*new_rsv)(struct uwb_pal *pal, struct uwb_rsv *rsv);
+
+	int channel;
 };
 
 void uwb_pal_init(struct uwb_pal *pal);
-int uwb_pal_register(struct uwb_rc *rc, struct uwb_pal *pal);
-void uwb_pal_unregister(struct uwb_rc *rc, struct uwb_pal *pal);
+int uwb_pal_register(struct uwb_pal *pal);
+void uwb_pal_unregister(struct uwb_pal *pal);
+
+int uwb_radio_start(struct uwb_pal *pal);
+void uwb_radio_stop(struct uwb_pal *pal);
 
 /*
  * General public API
diff --git a/include/linux/uwb/debug-cmd.h b/include/linux/uwb/debug-cmd.h
index 6a16566f022..07efbe17db5 100644
--- a/include/linux/uwb/debug-cmd.h
+++ b/include/linux/uwb/debug-cmd.h
@@ -34,6 +34,8 @@ enum uwb_dbg_cmd_type {
 	UWB_DBG_CMD_RSV_TERMINATE = 2,
 	UWB_DBG_CMD_IE_ADD = 3,
 	UWB_DBG_CMD_IE_RM = 4,
+	UWB_DBG_CMD_RADIO_START = 5,
+	UWB_DBG_CMD_RADIO_STOP = 6,
 };
 
 struct uwb_dbg_cmd_rsv_establish {
-- 
cgit v1.2.3-70-g09d2


From e8e1594c8126b1b773988fa2e3bfec76cff88336 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 17 Nov 2008 16:16:51 +0000
Subject: wlp: start/stop radio on network interface up/down

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/i1480/i1480u-wlp/lc.c     |  2 +-
 drivers/uwb/i1480/i1480u-wlp/netdev.c | 51 ++++++++---------------------------
 drivers/uwb/uwb-debug.c               |  3 ---
 drivers/uwb/wlp/wlp-lc.c              | 14 +++++++++-
 include/linux/wlp.h                   |  3 ++-
 5 files changed, 27 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/i1480/i1480u-wlp/lc.c b/drivers/uwb/i1480/i1480u-wlp/lc.c
index 384306c11bb..488b2e30a0a 100644
--- a/drivers/uwb/i1480/i1480u-wlp/lc.c
+++ b/drivers/uwb/i1480/i1480u-wlp/lc.c
@@ -206,7 +206,7 @@ int i1480u_add(struct i1480u *i1480u, struct usb_interface *iface)
 	wlp->fill_device_info = i1480u_fill_device_info;
 	wlp->stop_queue = i1480u_stop_queue;
 	wlp->start_queue = i1480u_start_queue;
-	result = wlp_setup(wlp, rc);
+	result = wlp_setup(wlp, rc, net_dev);
 	if (result < 0) {
 		dev_err(&iface->dev, "Cannot setup WLP\n");
 		goto error_wlp_setup;
diff --git a/drivers/uwb/i1480/i1480u-wlp/netdev.c b/drivers/uwb/i1480/i1480u-wlp/netdev.c
index 8802ac43d87..2eafb973cd0 100644
--- a/drivers/uwb/i1480/i1480u-wlp/netdev.c
+++ b/drivers/uwb/i1480/i1480u-wlp/netdev.c
@@ -207,6 +207,11 @@ int i1480u_open(struct net_device *net_dev)
 	result = i1480u_rx_setup(i1480u);		/* Alloc RX stuff */
 	if (result < 0)
 		goto error_rx_setup;
+
+	result = uwb_radio_start(&wlp->pal);
+	if (result < 0)
+		goto error_radio_start;
+
 	netif_wake_queue(net_dev);
 #ifdef i1480u_FLOW_CONTROL
 	result = usb_submit_urb(i1480u->notif_urb, GFP_KERNEL);;
@@ -215,25 +220,20 @@ int i1480u_open(struct net_device *net_dev)
 		goto error_notif_urb_submit;
 	}
 #endif
-	i1480u->uwb_notifs_handler.cb = i1480u_uwb_notifs_cb;
-	i1480u->uwb_notifs_handler.data = i1480u;
-	if (uwb_bg_joined(rc))
-		netif_carrier_on(net_dev);
-	else
-		netif_carrier_off(net_dev);
-	uwb_notifs_register(rc, &i1480u->uwb_notifs_handler);
 	/* Interface is up with an address, now we can create WSS */
 	result = wlp_wss_setup(net_dev, &wlp->wss);
 	if (result < 0) {
 		dev_err(dev, "Can't create WSS: %d. \n", result);
-		goto error_notif_deregister;
+		goto error_wss_setup;
 	}
 	return 0;
-error_notif_deregister:
-	uwb_notifs_deregister(rc, &i1480u->uwb_notifs_handler);
+error_wss_setup:
 #ifdef i1480u_FLOW_CONTROL
+	usb_kill_urb(i1480u->notif_urb);
 error_notif_urb_submit:
 #endif
+	uwb_radio_stop(&wlp->pal);
+error_radio_start:
 	netif_stop_queue(net_dev);
 	i1480u_rx_release(i1480u);
 error_rx_setup:
@@ -248,16 +248,15 @@ int i1480u_stop(struct net_device *net_dev)
 {
 	struct i1480u *i1480u = netdev_priv(net_dev);
 	struct wlp *wlp = &i1480u->wlp;
-	struct uwb_rc *rc = wlp->rc;
 
 	BUG_ON(wlp->rc == NULL);
 	wlp_wss_remove(&wlp->wss);
-	uwb_notifs_deregister(rc, &i1480u->uwb_notifs_handler);
 	netif_carrier_off(net_dev);
 #ifdef i1480u_FLOW_CONTROL
 	usb_kill_urb(i1480u->notif_urb);
 #endif
 	netif_stop_queue(net_dev);
+	uwb_radio_stop(&wlp->pal);
 	i1480u_rx_release(i1480u);
 	i1480u_tx_release(i1480u);
 	return 0;
@@ -303,34 +302,6 @@ int i1480u_change_mtu(struct net_device *net_dev, int mtu)
 	return 0;
 }
 
-
-/**
- * Callback function to handle events from UWB
- * When we see other devices we know the carrier is ok,
- * if we are the only device in the beacon group we set the carrier
- * state to off.
- * */
-void i1480u_uwb_notifs_cb(void *data, struct uwb_dev *uwb_dev,
-			  enum uwb_notifs event)
-{
-	struct i1480u *i1480u = data;
-	struct net_device *net_dev = i1480u->net_dev;
-	struct device *dev = &i1480u->usb_iface->dev;
-	switch (event) {
-	case UWB_NOTIF_BG_JOIN:
-		netif_carrier_on(net_dev);
-		dev_info(dev, "Link is up\n");
-		break;
-	case UWB_NOTIF_BG_LEAVE:
-		netif_carrier_off(net_dev);
-		dev_info(dev, "Link is down\n");
-		break;
-	default:
-		dev_err(dev, "don't know how to handle event %d from uwb\n",
-				event);
-	}
-}
-
 /**
  * Stop the network queue
  *
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 0e58071a232..e02fb83469d 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -33,8 +33,6 @@
 #include <linux/seq_file.h>
 
 #include <linux/uwb/debug-cmd.h>
-#define D_LOCAL 0
-#include <linux/uwb/debug.h>
 
 #include "uwb-internal.h"
 
@@ -314,7 +312,6 @@ static struct file_operations drp_avail_fops = {
 
 static void uwb_dbg_channel_changed(struct uwb_pal *pal, int channel)
 {
-	struct uwb_dbg *dbg = container_of(pal, struct uwb_dbg, pal);
 	struct device *dev = &pal->rc->uwb_dev.dev;
 
 	if (channel > 0)
diff --git a/drivers/uwb/wlp/wlp-lc.c b/drivers/uwb/wlp/wlp-lc.c
index 7e5eb49b03b..e531093c416 100644
--- a/drivers/uwb/wlp/wlp-lc.c
+++ b/drivers/uwb/wlp/wlp-lc.c
@@ -526,7 +526,17 @@ void wlp_uwb_notifs_cb(void *_wlp, struct uwb_dev *uwb_dev,
 	}
 }
 
-int wlp_setup(struct wlp *wlp, struct uwb_rc *rc)
+static void wlp_channel_changed(struct uwb_pal *pal, int channel)
+{
+	struct wlp *wlp = container_of(pal, struct wlp, pal);
+
+	if (channel < 0)
+		netif_carrier_off(wlp->ndev);
+	else
+		netif_carrier_on(wlp->ndev);
+}
+
+int wlp_setup(struct wlp *wlp, struct uwb_rc *rc, struct net_device *ndev)
 {
 	struct device *dev = &rc->uwb_dev.dev;
 	int result;
@@ -537,6 +547,7 @@ int wlp_setup(struct wlp *wlp, struct uwb_rc *rc)
 	BUG_ON(wlp->stop_queue == NULL);
 	BUG_ON(wlp->start_queue == NULL);
 	wlp->rc = rc;
+	wlp->ndev = ndev;
 	wlp_eda_init(&wlp->eda);/* Set up address cache */
 	wlp->uwb_notifs_handler.cb = wlp_uwb_notifs_cb;
 	wlp->uwb_notifs_handler.data = wlp;
@@ -544,6 +555,7 @@ int wlp_setup(struct wlp *wlp, struct uwb_rc *rc)
 
 	uwb_pal_init(&wlp->pal);
 	wlp->pal.rc = rc;
+	wlp->pal.channel_changed = wlp_channel_changed;
 	result = uwb_pal_register(&wlp->pal);
 	if (result < 0)
 		uwb_notifs_deregister(wlp->rc, &wlp->uwb_notifs_handler);
diff --git a/include/linux/wlp.h b/include/linux/wlp.h
index 033545e145c..ac95ce6606a 100644
--- a/include/linux/wlp.h
+++ b/include/linux/wlp.h
@@ -646,6 +646,7 @@ struct wlp_wss {
 struct wlp {
 	struct mutex mutex;
 	struct uwb_rc *rc;		/* UWB radio controller */
+	struct net_device *ndev;
 	struct uwb_pal pal;
 	struct wlp_eda eda;
 	struct wlp_uuid uuid;
@@ -675,7 +676,7 @@ struct wlp_wss_attribute {
 static struct wlp_wss_attribute wss_attr_##_name = __ATTR(_name, _mode,	\
 							  _show, _store)
 
-extern int wlp_setup(struct wlp *, struct uwb_rc *);
+extern int wlp_setup(struct wlp *, struct uwb_rc *, struct net_device *ndev);
 extern void wlp_remove(struct wlp *);
 extern ssize_t wlp_neighborhood_show(struct wlp *, char *);
 extern int wlp_wss_setup(struct net_device *, struct wlp_wss *);
-- 
cgit v1.2.3-70-g09d2


From 0996e6382482ce9014787693d3884e9468153a5c Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 17 Nov 2008 16:23:22 +0000
Subject: uwb: remove unused beacon group join/leave events

The UWB_NOTIF_BG_JOIN/UWB_NOTIF_BG_LEAVE events have been
superceeded by the channel_changed callback in struct uwb_pal.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/beacon.c | 17 +----------------
 include/linux/uwb.h  |  7 +++----
 2 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/uwb/beacon.c b/drivers/uwb/beacon.c
index 247956098af..d9c60cb9499 100644
--- a/drivers/uwb/beacon.c
+++ b/drivers/uwb/beacon.c
@@ -140,10 +140,8 @@ int uwb_rc_beacon(struct uwb_rc *rc, int channel, unsigned bpst_offset)
 		}
 	}
 
-	if (result >= 0) {
+	if (result >= 0)
 		rc->beaconing = channel;
-		uwb_notify(rc, NULL, uwb_bg_joined(rc) ? UWB_NOTIF_BG_JOIN : UWB_NOTIF_BG_LEAVE);
-	}
 	return result;
 }
 
@@ -581,19 +579,6 @@ error:
 	return result;
 }
 
-/**
- * uwb_bg_joined - is the RC in a beacon group?
- * @rc: the radio controller
- *
- * Returns true if the radio controller is in a beacon group (even if
- * it's the sole member).
- */
-int uwb_bg_joined(struct uwb_rc *rc)
-{
-	return rc->beaconing != -1;
-}
-EXPORT_SYMBOL_GPL(uwb_bg_joined);
-
 /*
  * Print beaconing state.
  */
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index 7d3ebf046f9..1719709d60c 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -479,7 +479,6 @@ ssize_t uwb_rc_vcmd(struct uwb_rc *rc, const char *cmd_name,
 		    struct uwb_rccb *cmd, size_t cmd_size,
 		    u8 expected_type, u16 expected_event,
 		    struct uwb_rceb **preply);
-int uwb_bg_joined(struct uwb_rc *rc);
 
 size_t __uwb_addr_print(char *, size_t, const unsigned char *, int);
 
@@ -568,7 +567,9 @@ static inline bool uwb_rsv_is_owner(struct uwb_rsv *rsv)
 }
 
 /**
- * Events generated by UWB that can be passed to any listeners
+ * enum uwb_notifs - UWB events that can be passed to any listeners
+ * @UWB_NOTIF_ONAIR: a new neighbour has joined the beacon group.
+ * @UWB_NOTIF_OFFAIR: a neighbour has left the beacon group.
  *
  * Higher layers can register callback functions with the radio
  * controller using uwb_notifs_register(). The radio controller
@@ -576,8 +577,6 @@ static inline bool uwb_rsv_is_owner(struct uwb_rsv *rsv)
  * nodes when an event occurs.
  */
 enum uwb_notifs {
-	UWB_NOTIF_BG_JOIN = 0,	/* radio controller joined a beacon group */
-	UWB_NOTIF_BG_LEAVE = 1,	/* radio controller left a beacon group */
 	UWB_NOTIF_ONAIR,
 	UWB_NOTIF_OFFAIR,
 };
-- 
cgit v1.2.3-70-g09d2


From d314774cf2cd5dfeb39a00d37deee65d4c627927 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 19 Nov 2008 21:32:24 -0800
Subject: netdev: network device operations infrastructure

This patch changes the network device internal API to move adminstrative
operations out of the network device structure and into a separate structure.

This patch involves some hackery to maintain compatablity between the
new and old model, so all 300+ drivers don't have to be changed at once.
For drivers that aren't converted yet, the netdevice_ops virt function list
still resides in the net_device structure. For old protocols, the new
net_device_ops are copied out to the old net_device pointers.

After the transistion is completed the nag message can be changed to
an WARN_ON, and the compatiablity code can be made configurable.

Some function pointers aren't moved:
* destructor can't be in net_device_ops because
  it may need to be referenced after the module is unloaded.
* neighbor setup is manipulated in a couple of places that need special
  consideration
* hard_start_xmit is in the fast path for transmit.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 232 +++++++++++++++++++++++++++++++++-------------
 net/Kconfig               |   3 +
 net/core/dev.c            | 109 +++++++++++++++-------
 net/core/netpoll.c        |   7 +-
 net/core/rtnetlink.c      |   9 +-
 net/sched/sch_generic.c   |   4 +-
 6 files changed, 259 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 12d7f4469dc..9060f5f3517 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -451,6 +451,131 @@ struct netdev_queue {
 	struct Qdisc		*qdisc_sleeping;
 } ____cacheline_aligned_in_smp;
 
+
+/*
+ * This structure defines the management hooks for network devices.
+ * The following hooks can bed defined and are optonal (can be null)
+ * unless otherwise noted.
+ *
+ * int (*ndo_init)(struct net_device *dev);
+ *     This function is called once when network device is registered.
+ *     The network device can use this to any late stage initializaton
+ *     or semantic validattion. It can fail with an error code which will
+ *     be propogated back to register_netdev
+ *
+ * void (*ndo_uninit)(struct net_device *dev);
+ *     This function is called when device is unregistered or when registration
+ *     fails. It is not called if init fails.
+ *
+ * int (*ndo_open)(struct net_device *dev);
+ *     This function is called when network device transistions to the up
+ *     state.
+ *
+ * int (*ndo_stop)(struct net_device *dev);
+ *     This function is called when network device transistions to the down
+ *     state.
+ *
+ * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
+ *	This function is called to allow device receiver to make
+ *	changes to configuration when multicast or promiscious is enabled.
+ *
+ * void (*ndo_set_rx_mode)(struct net_device *dev);
+ *	This function is called device changes address list filtering.
+ *
+ * void (*ndo_set_multicast_list)(struct net_device *dev);
+ *	This function is called when the multicast address list changes.
+ *
+ * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
+ *	This function  is called when the Media Access Control address
+ *	needs to be changed. If not this interface is not defined, the
+ *	mac address can not be changed.
+ *
+ * int (*ndo_validate_addr)(struct net_device *dev);
+ *	Test if Media Access Control address is valid for the device.
+ *
+ * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
+ *	Called when a user request an ioctl which can't be handled by
+ *	the generic interface code. If not defined ioctl's return
+ *	not supported error code.
+ *
+ * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
+ *	Used to set network devices bus interface parameters. This interface
+ *	is retained for legacy reason, new devices should use the bus
+ *	interface (PCI) for low level management.
+ *
+ * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
+ *	Called when a user wants to change the Maximum Transfer Unit
+ *	of a device. If not defined, any request to change MTU will
+ *	will return an error.
+ *
+ * void (*ndo_tx_timeout) (struct net_device *dev);
+ *	Callback uses when the transmitter has not made any progress
+ *	for dev->watchdog ticks.
+ *
+ * struct net_device_stats* (*get_stats)(struct net_device *dev);
+ *	Called when a user wants to get the network device usage
+ *	statistics. If not defined, the counters in dev->stats will
+ *	be used.
+ *
+ * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp);
+ *	If device support VLAN receive accleration
+ *	(ie. dev->features & NETIF_F_HW_VLAN_RX), then this function is called
+ *	when vlan groups for the device changes.  Note: grp is NULL
+ *	if no vlan's groups are being used.
+ *
+ * void (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid);
+ *	If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER)
+ *	this function is called when a VLAN id is registered.
+ *
+ * void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid);
+ *	If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER)
+ *	this function is called when a VLAN id is unregistered.
+ *
+ * void (*ndo_poll_controller)(struct net_device *dev);
+ */
+struct net_device_ops {
+	int			(*ndo_init)(struct net_device *dev);
+	void			(*ndo_uninit)(struct net_device *dev);
+	int			(*ndo_open)(struct net_device *dev);
+	int			(*ndo_stop)(struct net_device *dev);
+#define HAVE_CHANGE_RX_FLAGS
+	void			(*ndo_change_rx_flags)(struct net_device *dev,
+						       int flags);
+#define HAVE_SET_RX_MODE
+	void			(*ndo_set_rx_mode)(struct net_device *dev);
+#define HAVE_MULTICAST
+	void			(*ndo_set_multicast_list)(struct net_device *dev);
+#define HAVE_SET_MAC_ADDR
+	int			(*ndo_set_mac_address)(struct net_device *dev,
+						       void *addr);
+#define HAVE_VALIDATE_ADDR
+	int			(*ndo_validate_addr)(struct net_device *dev);
+#define HAVE_PRIVATE_IOCTL
+	int			(*ndo_do_ioctl)(struct net_device *dev,
+					        struct ifreq *ifr, int cmd);
+#define HAVE_SET_CONFIG
+	int			(*ndo_set_config)(struct net_device *dev,
+					          struct ifmap *map);
+#define HAVE_CHANGE_MTU
+	int			(*ndo_change_mtu)(struct net_device *dev, int new_mtu);
+
+#define HAVE_TX_TIMEOUT
+	void			(*ndo_tx_timeout) (struct net_device *dev);
+
+	struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
+
+	void			(*ndo_vlan_rx_register)(struct net_device *dev,
+						        struct vlan_group *grp);
+	void			(*ndo_vlan_rx_add_vid)(struct net_device *dev,
+						       unsigned short vid);
+	void			(*ndo_vlan_rx_kill_vid)(struct net_device *dev,
+						        unsigned short vid);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+#define HAVE_NETDEV_POLL
+	void                    (*ndo_poll_controller)(struct net_device *dev);
+#endif
+};
+
 /*
  *	The DEVICE structure.
  *	Actually, this whole structure is a big mistake.  It mixes I/O
@@ -498,11 +623,6 @@ struct net_device
 #ifdef CONFIG_NETPOLL
 	struct list_head	napi_list;
 #endif
-	
-	/* The device initialization function. Called only once. */
-	int			(*init)(struct net_device *dev);
-
-	/* ------- Fields preinitialized in Space.c finish here ------- */
 
 	/* Net device features */
 	unsigned long		features;
@@ -546,15 +666,13 @@ struct net_device
 	 * for all in netdev_increment_features.
 	 */
 #define NETIF_F_ONE_FOR_ALL	(NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
-				 NETIF_F_SG | NETIF_F_HIGHDMA | \
+				 NETIF_F_SG | NETIF_F_HIGHDMA |		\
 				 NETIF_F_FRAGLIST)
 
 	/* Interface index. Unique device identifier	*/
 	int			ifindex;
 	int			iflink;
 
-
-	struct net_device_stats* (*get_stats)(struct net_device *dev);
 	struct net_device_stats	stats;
 
 #ifdef CONFIG_WIRELESS_EXT
@@ -564,18 +682,13 @@ struct net_device
 	/* Instance data managed by the core of Wireless Extensions. */
 	struct iw_public_data *	wireless_data;
 #endif
+	/* Management operations */
+	const struct net_device_ops *netdev_ops;
 	const struct ethtool_ops *ethtool_ops;
 
 	/* Hardware header description */
 	const struct header_ops *header_ops;
 
-	/*
-	 * This marks the end of the "visible" part of the structure. All
-	 * fields hereafter are internal to the system, and may change at
-	 * will (read: may be cleaned up at will).
-	 */
-
-
 	unsigned int		flags;	/* interface flags (a la BSD)	*/
 	unsigned short		gflags;
         unsigned short          priv_flags; /* Like 'flags' but invisible to userspace. */
@@ -634,7 +747,7 @@ struct net_device
 	unsigned long		last_rx;	/* Time of last Rx	*/
 	/* Interface address info used in eth_type_trans() */
 	unsigned char		dev_addr[MAX_ADDR_LEN];	/* hw address, (before bcast 
-							because most packets are unicast) */
+							   because most packets are unicast) */
 
 	unsigned char		broadcast[MAX_ADDR_LEN];	/* hw bcast add	*/
 
@@ -648,6 +761,10 @@ struct net_device
 	/* Number of TX queues currently active in device  */
 	unsigned int		real_num_tx_queues;
 
+	/* Map buffer to appropriate transmit queue */
+	u16			(*select_queue)(struct net_device *dev,
+						struct sk_buff *skb);
+
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
 	spinlock_t		tx_global_lock;
 /*
@@ -662,9 +779,6 @@ struct net_device
 	int			watchdog_timeo; /* used by dev_watchdog() */
 	struct timer_list	watchdog_timer;
 
-/*
- * refcnt is a very hot point, so align it on SMP
- */
 	/* Number of references to this device */
 	atomic_t		refcnt ____cacheline_aligned_in_smp;
 
@@ -683,56 +797,14 @@ struct net_device
 	       NETREG_RELEASED,		/* called free_netdev */
 	} reg_state;
 
-	/* Called after device is detached from network. */
-	void			(*uninit)(struct net_device *dev);
-	/* Called after last user reference disappears. */
-	void			(*destructor)(struct net_device *dev);
+	/* Called from unregister, can be used to call free_netdev */
+	void (*destructor)(struct net_device *dev);
 
-	/* Pointers to interface service routines.	*/
-	int			(*open)(struct net_device *dev);
-	int			(*stop)(struct net_device *dev);
-#define HAVE_NETDEV_POLL
-#define HAVE_CHANGE_RX_FLAGS
-	void			(*change_rx_flags)(struct net_device *dev,
-						   int flags);
-#define HAVE_SET_RX_MODE
-	void			(*set_rx_mode)(struct net_device *dev);
-#define HAVE_MULTICAST			 
-	void			(*set_multicast_list)(struct net_device *dev);
-#define HAVE_SET_MAC_ADDR  		 
-	int			(*set_mac_address)(struct net_device *dev,
-						   void *addr);
-#define HAVE_VALIDATE_ADDR
-	int			(*validate_addr)(struct net_device *dev);
-#define HAVE_PRIVATE_IOCTL
-	int			(*do_ioctl)(struct net_device *dev,
-					    struct ifreq *ifr, int cmd);
-#define HAVE_SET_CONFIG
-	int			(*set_config)(struct net_device *dev,
-					      struct ifmap *map);
-#define HAVE_CHANGE_MTU
-	int			(*change_mtu)(struct net_device *dev, int new_mtu);
+	int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
 
-#define HAVE_TX_TIMEOUT
-	void			(*tx_timeout) (struct net_device *dev);
-
-	void			(*vlan_rx_register)(struct net_device *dev,
-						    struct vlan_group *grp);
-	void			(*vlan_rx_add_vid)(struct net_device *dev,
-						   unsigned short vid);
-	void			(*vlan_rx_kill_vid)(struct net_device *dev,
-						    unsigned short vid);
-
-	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
 #ifdef CONFIG_NETPOLL
 	struct netpoll_info	*npinfo;
 #endif
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	void                    (*poll_controller)(struct net_device *dev);
-#endif
-
-	u16			(*select_queue)(struct net_device *dev,
-						struct sk_buff *skb);
 
 #ifdef CONFIG_NET_NS
 	/* Network namespace this network device is inside */
@@ -763,6 +835,38 @@ struct net_device
 	/* for setting kernel sock attribute on TCP connection setup */
 #define GSO_MAX_SIZE		65536
 	unsigned int		gso_max_size;
+
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
+	struct {
+		int			(*init)(struct net_device *dev);
+		void			(*uninit)(struct net_device *dev);
+		int			(*open)(struct net_device *dev);
+		int			(*stop)(struct net_device *dev);
+		void			(*change_rx_flags)(struct net_device *dev,
+							   int flags);
+		void			(*set_rx_mode)(struct net_device *dev);
+		void			(*set_multicast_list)(struct net_device *dev);
+		int			(*set_mac_address)(struct net_device *dev,
+							   void *addr);
+		int			(*validate_addr)(struct net_device *dev);
+		int			(*do_ioctl)(struct net_device *dev,
+						    struct ifreq *ifr, int cmd);
+		int			(*set_config)(struct net_device *dev,
+						      struct ifmap *map);
+		int			(*change_mtu)(struct net_device *dev, int new_mtu);
+		void			(*tx_timeout) (struct net_device *dev);
+		struct net_device_stats* (*get_stats)(struct net_device *dev);
+		void			(*vlan_rx_register)(struct net_device *dev,
+							    struct vlan_group *grp);
+		void			(*vlan_rx_add_vid)(struct net_device *dev,
+							   unsigned short vid);
+		void			(*vlan_rx_kill_vid)(struct net_device *dev,
+							    unsigned short vid);
+#ifdef CONFIG_NET_POLL_CONTROLLER
+		void                    (*poll_controller)(struct net_device *dev);
+#endif
+#endif
+	};
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
diff --git a/net/Kconfig b/net/Kconfig
index 8c3d97ca0d9..4e2e40ba8ba 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -32,6 +32,9 @@ config NET_NS
 	  Allow user space to create what appear to be multiple instances
 	  of the network stack.
 
+config COMPAT_NET_DEV_OPS
+       def_bool y
+
 source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
diff --git a/net/core/dev.c b/net/core/dev.c
index e08c0fcd603..ca14ab407b3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1059,6 +1059,7 @@ void dev_load(struct net *net, const char *name)
  */
 int dev_open(struct net_device *dev)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
 	int ret = 0;
 
 	ASSERT_RTNL();
@@ -1081,11 +1082,11 @@ int dev_open(struct net_device *dev)
 	 */
 	set_bit(__LINK_STATE_START, &dev->state);
 
-	if (dev->validate_addr)
-		ret = dev->validate_addr(dev);
+	if (ops->ndo_validate_addr)
+		ret = ops->ndo_validate_addr(dev);
 
-	if (!ret && dev->open)
-		ret = dev->open(dev);
+	if (!ret && ops->ndo_open)
+		ret = ops->ndo_open(dev);
 
 	/*
 	 *	If it went open OK then:
@@ -1129,6 +1130,7 @@ int dev_open(struct net_device *dev)
  */
 int dev_close(struct net_device *dev)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
 	ASSERT_RTNL();
 
 	might_sleep();
@@ -1161,8 +1163,8 @@ int dev_close(struct net_device *dev)
 	 *	We allow it to be called even after a DETACH hot-plug
 	 *	event.
 	 */
-	if (dev->stop)
-		dev->stop(dev);
+	if (ops->ndo_stop)
+		ops->ndo_stop(dev);
 
 	/*
 	 *	Device is now down.
@@ -2930,8 +2932,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master)
 
 static void dev_change_rx_flags(struct net_device *dev, int flags)
 {
-	if (dev->flags & IFF_UP && dev->change_rx_flags)
-		dev->change_rx_flags(dev, flags);
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
+		ops->ndo_change_rx_flags(dev, flags);
 }
 
 static int __dev_set_promiscuity(struct net_device *dev, int inc)
@@ -3051,6 +3055,8 @@ int dev_set_allmulti(struct net_device *dev, int inc)
  */
 void __dev_set_rx_mode(struct net_device *dev)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
+
 	/* dev_open will call this function so the list will stay sane. */
 	if (!(dev->flags&IFF_UP))
 		return;
@@ -3058,8 +3064,8 @@ void __dev_set_rx_mode(struct net_device *dev)
 	if (!netif_device_present(dev))
 		return;
 
-	if (dev->set_rx_mode)
-		dev->set_rx_mode(dev);
+	if (ops->ndo_set_rx_mode)
+		ops->ndo_set_rx_mode(dev);
 	else {
 		/* Unicast addresses changes may only happen under the rtnl,
 		 * therefore calling __dev_set_promiscuity here is safe.
@@ -3072,8 +3078,8 @@ void __dev_set_rx_mode(struct net_device *dev)
 			dev->uc_promisc = 0;
 		}
 
-		if (dev->set_multicast_list)
-			dev->set_multicast_list(dev);
+		if (ops->ndo_set_multicast_list)
+			ops->ndo_set_multicast_list(dev);
 	}
 }
 
@@ -3432,6 +3438,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
  */
 int dev_set_mtu(struct net_device *dev, int new_mtu)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
 	if (new_mtu == dev->mtu)
@@ -3445,10 +3452,11 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 		return -ENODEV;
 
 	err = 0;
-	if (dev->change_mtu)
-		err = dev->change_mtu(dev, new_mtu);
+	if (ops->ndo_change_mtu)
+		err = ops->ndo_change_mtu(dev, new_mtu);
 	else
 		dev->mtu = new_mtu;
+
 	if (!err && dev->flags & IFF_UP)
 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 	return err;
@@ -3463,15 +3471,16 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
  */
 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
 	int err;
 
-	if (!dev->set_mac_address)
+	if (!ops->ndo_set_mac_address)
 		return -EOPNOTSUPP;
 	if (sa->sa_family != dev->type)
 		return -EINVAL;
 	if (!netif_device_present(dev))
 		return -ENODEV;
-	err = dev->set_mac_address(dev, sa);
+	err = ops->ndo_set_mac_address(dev, sa);
 	if (!err)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
 	return err;
@@ -3551,6 +3560,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 {
 	int err;
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
+	const struct net_device_ops *ops = dev->netdev_ops;
 
 	if (!dev)
 		return -ENODEV;
@@ -3578,15 +3588,15 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 			return 0;
 
 		case SIOCSIFMAP:
-			if (dev->set_config) {
+			if (ops->ndo_set_config) {
 				if (!netif_device_present(dev))
 					return -ENODEV;
-				return dev->set_config(dev, &ifr->ifr_map);
+				return ops->ndo_set_config(dev, &ifr->ifr_map);
 			}
 			return -EOPNOTSUPP;
 
 		case SIOCADDMULTI:
-			if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
+			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 				return -EINVAL;
 			if (!netif_device_present(dev))
@@ -3595,7 +3605,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 					  dev->addr_len, 1);
 
 		case SIOCDELMULTI:
-			if ((!dev->set_multicast_list && !dev->set_rx_mode) ||
+			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
 				return -EINVAL;
 			if (!netif_device_present(dev))
@@ -3633,10 +3643,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 			    cmd == SIOCBRDELIF ||
 			    cmd == SIOCWANDEV) {
 				err = -EOPNOTSUPP;
-				if (dev->do_ioctl) {
+				if (ops->ndo_do_ioctl) {
 					if (netif_device_present(dev))
-						err = dev->do_ioctl(dev, ifr,
-								    cmd);
+						err = ops->ndo_do_ioctl(dev, ifr, cmd);
 					else
 						err = -ENODEV;
 				}
@@ -3897,8 +3906,8 @@ static void rollback_registered(struct net_device *dev)
 	 */
 	dev_addr_discard(dev);
 
-	if (dev->uninit)
-		dev->uninit(dev);
+	if (dev->netdev_ops->ndo_uninit)
+		dev->netdev_ops->ndo_uninit(dev);
 
 	/* Notifier chain MUST detach us from master device. */
 	WARN_ON(dev->master);
@@ -3988,7 +3997,7 @@ int register_netdevice(struct net_device *dev)
 	struct hlist_head *head;
 	struct hlist_node *p;
 	int ret;
-	struct net *net;
+	struct net *net = dev_net(dev);
 
 	BUG_ON(dev_boot_phase);
 	ASSERT_RTNL();
@@ -3997,8 +4006,7 @@ int register_netdevice(struct net_device *dev)
 
 	/* When net_device's are persistent, this will be fatal. */
 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
-	BUG_ON(!dev_net(dev));
-	net = dev_net(dev);
+	BUG_ON(!net);
 
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
@@ -4006,9 +4014,46 @@ int register_netdevice(struct net_device *dev)
 
 	dev->iflink = -1;
 
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
+	/* Netdevice_ops API compatiability support.
+	 * This is temporary until all network devices are converted.
+	 */
+	if (dev->netdev_ops) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+
+		dev->init = ops->ndo_init;
+		dev->uninit = ops->ndo_uninit;
+		dev->open = ops->ndo_open;
+		dev->change_rx_flags = ops->ndo_change_rx_flags;
+		dev->set_rx_mode = ops->ndo_set_rx_mode;
+		dev->set_multicast_list = ops->ndo_set_multicast_list;
+		dev->set_mac_address = ops->ndo_set_mac_address;
+		dev->validate_addr = ops->ndo_validate_addr;
+		dev->do_ioctl = ops->ndo_do_ioctl;
+		dev->set_config = ops->ndo_set_config;
+		dev->change_mtu = ops->ndo_change_mtu;
+		dev->tx_timeout = ops->ndo_tx_timeout;
+		dev->get_stats = ops->ndo_get_stats;
+		dev->vlan_rx_register = ops->ndo_vlan_rx_register;
+		dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
+		dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
+#ifdef CONFIG_NET_POLL_CONTROLLER
+		dev->poll_controller = ops->ndo_poll_controller;
+#endif
+	} else {
+		char drivername[64];
+		pr_info("%s (%s): not using net_device_ops yet\n",
+			dev->name, netdev_drivername(dev, drivername, 64));
+
+		/* This works only because net_device_ops and the
+		   compatiablity structure are the same. */
+		dev->netdev_ops = (void *) &(dev->init);
+	}
+#endif
+
 	/* Init, if this function is available */
-	if (dev->init) {
-		ret = dev->init(dev);
+	if (dev->netdev_ops->ndo_init) {
+		ret = dev->netdev_ops->ndo_init(dev);
 		if (ret) {
 			if (ret > 0)
 				ret = -EIO;
@@ -4086,8 +4131,8 @@ out:
 	return ret;
 
 err_uninit:
-	if (dev->uninit)
-		dev->uninit(dev);
+	if (dev->netdev_ops->ndo_uninit)
+		dev->netdev_ops->ndo_uninit(dev);
 	goto out;
 }
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index fc4e28e23b8..630df603444 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -172,12 +172,13 @@ static void service_arp_queue(struct netpoll_info *npi)
 void netpoll_poll(struct netpoll *np)
 {
 	struct net_device *dev = np->dev;
+	const struct net_device_ops *ops = dev->netdev_ops;
 
-	if (!dev || !netif_running(dev) || !dev->poll_controller)
+	if (!dev || !netif_running(dev) || !ops->ndo_poll_controller)
 		return;
 
 	/* Process pending work on NIC */
-	dev->poll_controller(dev);
+	ops->ndo_poll_controller(dev);
 
 	poll_napi(dev);
 
@@ -694,7 +695,7 @@ int netpoll_setup(struct netpoll *np)
 		atomic_inc(&npinfo->refcnt);
 	}
 
-	if (!ndev->poll_controller) {
+	if (!ndev->netdev_ops->ndo_poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
 		       np->name, np->dev_name);
 		err = -ENOTSUPP;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4dfb6b4d455..6f8e0778e56 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -762,6 +762,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		      struct nlattr **tb, char *ifname, int modified)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
 	int send_addr_notify = 0;
 	int err;
 
@@ -783,7 +784,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		struct rtnl_link_ifmap *u_map;
 		struct ifmap k_map;
 
-		if (!dev->set_config) {
+		if (!ops->ndo_set_config) {
 			err = -EOPNOTSUPP;
 			goto errout;
 		}
@@ -801,7 +802,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		k_map.dma = (unsigned char) u_map->dma;
 		k_map.port = (unsigned char) u_map->port;
 
-		err = dev->set_config(dev, &k_map);
+		err = ops->ndo_set_config(dev, &k_map);
 		if (err < 0)
 			goto errout;
 
@@ -812,7 +813,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		struct sockaddr *sa;
 		int len;
 
-		if (!dev->set_mac_address) {
+		if (!ops->ndo_set_mac_address) {
 			err = -EOPNOTSUPP;
 			goto errout;
 		}
@@ -831,7 +832,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		sa->sa_family = dev->type;
 		memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
 		       dev->addr_len);
-		err = dev->set_mac_address(dev, sa);
+		err = ops->ndo_set_mac_address(dev, sa);
 		kfree(sa);
 		if (err)
 			goto errout;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 80c8f3dbbea..95ab55c064f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -224,7 +224,7 @@ static void dev_watchdog(unsigned long arg)
 				char drivername[64];
 				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n",
 				       dev->name, netdev_drivername(dev, drivername, 64));
-				dev->tx_timeout(dev);
+				dev->netdev_ops->ndo_tx_timeout(dev);
 			}
 			if (!mod_timer(&dev->watchdog_timer,
 				       round_jiffies(jiffies +
@@ -239,7 +239,7 @@ static void dev_watchdog(unsigned long arg)
 
 void __netdev_watchdog_up(struct net_device *dev)
 {
-	if (dev->tx_timeout) {
+	if (dev->netdev_ops->ndo_tx_timeout) {
 		if (dev->watchdog_timeo <= 0)
 			dev->watchdog_timeo = 5*HZ;
 		if (!mod_timer(&dev->watchdog_timer,
-- 
cgit v1.2.3-70-g09d2


From eeda3fd64f75bcbfaa70ce946513abaf3f23b8e0 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 19 Nov 2008 21:40:23 -0800
Subject: netdev: introduce dev_get_stats()

In order for the network device ops get_stats call to be immutable, the handling
of the default internal network device stats block has to be changed. Add a new
helper function which replaces the old use of internal_get_stats.

Note: change return code to make it clear that the caller should not
go changing the returned statistics.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/appldata/appldata_net_sum.c |  4 ++--
 drivers/net/bonding/bond_main.c       |  5 +++--
 drivers/net/sfc/ethtool.c             |  2 +-
 drivers/parisc/led.c                  |  4 ++--
 include/linux/netdevice.h             |  4 +++-
 net/core/dev.c                        | 23 ++++++++++++++++++-----
 net/core/net-sysfs.c                  |  3 +--
 net/core/rtnetlink.c                  |  6 +++---
 8 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c
index 3b746556e1a..fa741f84c5b 100644
--- a/arch/s390/appldata/appldata_net_sum.c
+++ b/arch/s390/appldata/appldata_net_sum.c
@@ -67,7 +67,6 @@ static void appldata_get_net_sum_data(void *data)
 	int i;
 	struct appldata_net_sum_data *net_data;
 	struct net_device *dev;
-	struct net_device_stats *stats;
 	unsigned long rx_packets, tx_packets, rx_bytes, tx_bytes, rx_errors,
 			tx_errors, rx_dropped, tx_dropped, collisions;
 
@@ -86,7 +85,8 @@ static void appldata_get_net_sum_data(void *data)
 	collisions = 0;
 	read_lock(&dev_base_lock);
 	for_each_netdev(&init_net, dev) {
-		stats = dev->get_stats(dev);
+		const struct net_device_stats *stats = dev_get_stats(dev);
+
 		rx_packets += stats->rx_packets;
 		tx_packets += stats->tx_packets;
 		rx_bytes   += stats->rx_bytes;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a08ea480805..db5f5c24a25 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3899,7 +3899,7 @@ static int bond_close(struct net_device *bond_dev)
 static struct net_device_stats *bond_get_stats(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
-	struct net_device_stats *stats = &(bond->stats), *sstats;
+	struct net_device_stats *stats = &bond->stats;
 	struct net_device_stats local_stats;
 	struct slave *slave;
 	int i;
@@ -3909,7 +3909,8 @@ static struct net_device_stats *bond_get_stats(struct net_device *bond_dev)
 	read_lock_bh(&bond->lock);
 
 	bond_for_each_slave(bond, slave, i) {
-		sstats = slave->dev->get_stats(slave->dev);
+		const struct net_device_stats *sstats = dev_get_stats(slave->dev);
+
 		local_stats.rx_packets += sstats->rx_packets;
 		local_stats.rx_bytes += sstats->rx_bytes;
 		local_stats.rx_errors += sstats->rx_errors;
diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c
index abd8fcd6e62..cd92c4d8dbc 100644
--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -426,7 +426,7 @@ static void efx_ethtool_get_stats(struct net_device *net_dev,
 	EFX_BUG_ON_PARANOID(stats->n_stats != EFX_ETHTOOL_NUM_STATS);
 
 	/* Update MAC and NIC statistics */
-	net_dev->get_stats(net_dev);
+	dev_get_stats(net_dev);
 
 	/* Fill detailed statistics buffer */
 	for (i = 0; i < EFX_ETHTOOL_NUM_STATS; i++) {
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index f9b12664f9f..454b6532e40 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -360,13 +360,13 @@ static __inline__ int led_get_net_activity(void)
 	read_lock(&dev_base_lock);
 	rcu_read_lock();
 	for_each_netdev(&init_net, dev) {
-	    struct net_device_stats *stats;
+	    const struct net_device_stats *stats;
 	    struct in_device *in_dev = __in_dev_get_rcu(dev);
 	    if (!in_dev || !in_dev->ifa_list)
 		continue;
 	    if (ipv4_is_loopback(in_dev->ifa_list->ifa_local))
 		continue;
-	    stats = dev->get_stats(dev);
+	    stats = dev_get_stats(dev);
 	    rx_total += stats->rx_packets;
 	    tx_total += stats->tx_packets;
 	}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9060f5f3517..981a089d514 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -864,9 +864,9 @@ struct net_device
 							    unsigned short vid);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 		void                    (*poll_controller)(struct net_device *dev);
-#endif
 #endif
 	};
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -1780,6 +1780,8 @@ extern void		netdev_features_change(struct net_device *dev);
 /* Load a device via the kmod */
 extern void		dev_load(struct net *net, const char *name);
 extern void		dev_mcast_init(void);
+extern const struct net_device_stats *dev_get_stats(struct net_device *dev);
+
 extern int		netdev_max_backlog;
 extern int		weight_p;
 extern int		netdev_set_master(struct net_device *dev, struct net_device *master);
diff --git a/net/core/dev.c b/net/core/dev.c
index ca14ab407b3..8843f4e3f5e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2620,7 +2620,7 @@ void dev_seq_stop(struct seq_file *seq, void *v)
 
 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
 {
-	struct net_device_stats *stats = dev->get_stats(dev);
+	const struct net_device_stats *stats = dev_get_stats(dev);
 
 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
@@ -4288,10 +4288,24 @@ void netdev_run_todo(void)
 	}
 }
 
-static struct net_device_stats *internal_stats(struct net_device *dev)
-{
-	return &dev->stats;
+/**
+ *	dev_get_stats	- get network device statistics
+ *	@dev: device to get statistics from
+ *
+ *	Get network statistics from device. The device driver may provide
+ *	its own method by setting dev->netdev_ops->get_stats; otherwise
+ *	the internal statistics structure is used.
+ */
+const struct net_device_stats *dev_get_stats(struct net_device *dev)
+ {
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_get_stats)
+		return ops->ndo_get_stats(dev);
+	else
+		return &dev->stats;
 }
+EXPORT_SYMBOL(dev_get_stats);
 
 static void netdev_init_one_queue(struct net_device *dev,
 				  struct netdev_queue *queue,
@@ -4370,7 +4384,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	netdev_init_queues(dev);
 
-	dev->get_stats = internal_stats;
 	netpoll_netdev_init(dev);
 	setup(dev);
 	strcpy(dev->name, name);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 146dcfeb060..afd42d71732 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -270,7 +270,6 @@ static ssize_t netstat_show(const struct device *d,
 			    unsigned long offset)
 {
 	struct net_device *dev = to_net_dev(d);
-	struct net_device_stats *stats;
 	ssize_t ret = -EINVAL;
 
 	WARN_ON(offset > sizeof(struct net_device_stats) ||
@@ -278,7 +277,7 @@ static ssize_t netstat_show(const struct device *d,
 
 	read_lock(&dev_base_lock);
 	if (dev_isalive(dev)) {
-		stats = dev->get_stats(dev);
+		const struct net_device_stats *stats = dev_get_stats(dev);
 		ret = sprintf(buf, fmt_ulong,
 			      *(unsigned long *)(((u8 *) stats) + offset));
 	}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6f8e0778e56..790dd205bb5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -551,7 +551,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
 }
 
 static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
-				 struct net_device_stats *b)
+				 const struct net_device_stats *b)
 {
 	a->rx_packets = b->rx_packets;
 	a->tx_packets = b->tx_packets;
@@ -609,7 +609,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	struct netdev_queue *txq;
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
-	struct net_device_stats *stats;
+	const struct net_device_stats *stats;
 	struct nlattr *attr;
 
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -666,7 +666,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (attr == NULL)
 		goto nla_put_failure;
 
-	stats = dev->get_stats(dev);
+	stats = dev_get_stats(dev);
 	copy_rtnl_link_stats(nla_data(attr), stats);
 
 	if (dev->rtnl_link_ops) {
-- 
cgit v1.2.3-70-g09d2


From ccad637b0c57de1825ffd34c311bf71487545ac2 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 19 Nov 2008 22:42:31 -0800
Subject: netdev: expose ethernet address primitives

When ethernet devices are converted, the function pointer setup
by eth_setup() need to be done during intialization.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/usb/gadget/u_ether.c |  4 ++--
 include/linux/etherdevice.h  |  4 ++++
 net/ethernet/eth.c           | 13 ++++++++-----
 3 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c
index 00fa5239879..d9739d52f8f 100644
--- a/drivers/usb/gadget/u_ether.c
+++ b/drivers/usb/gadget/u_ether.c
@@ -146,7 +146,7 @@ static inline int qlen(struct usb_gadget *gadget)
 
 /* NETWORK DRIVER HOOKUP (to the layer above this driver) */
 
-static int eth_change_mtu(struct net_device *net, int new_mtu)
+static int ueth_change_mtu(struct net_device *net, int new_mtu)
 {
 	struct eth_dev	*dev = netdev_priv(net);
 	unsigned long	flags;
@@ -764,7 +764,7 @@ int __init gether_setup(struct usb_gadget *g, u8 ethaddr[ETH_ALEN])
 	if (ethaddr)
 		memcpy(ethaddr, dev->host_mac, ETH_ALEN);
 
-	net->change_mtu = eth_change_mtu;
+	net->change_mtu = ueth_change_mtu;
 	net->hard_start_xmit = eth_start_xmit;
 	net->open = eth_open;
 	net->stop = eth_stop;
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 25d62e6e329..0e5e9706003 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -41,6 +41,10 @@ extern int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh);
 extern void eth_header_cache_update(struct hh_cache *hh,
 				    const struct net_device *dev,
 				    const unsigned char *haddr);
+extern int eth_mac_addr(struct net_device *dev, void *p);
+extern int eth_change_mtu(struct net_device *dev, int new_mtu);
+extern int eth_validate_addr(struct net_device *dev);
+
 
 
 extern struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index b9d85af2dd3..a87a171d991 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -282,7 +282,7 @@ EXPORT_SYMBOL(eth_header_cache_update);
  * This doesn't change hardware matching, so needs to be overridden
  * for most real devices.
  */
-static int eth_mac_addr(struct net_device *dev, void *p)
+int eth_mac_addr(struct net_device *dev, void *p)
 {
 	struct sockaddr *addr = p;
 
@@ -293,6 +293,7 @@ static int eth_mac_addr(struct net_device *dev, void *p)
 	memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
 	return 0;
 }
+EXPORT_SYMBOL(eth_mac_addr);
 
 /**
  * eth_change_mtu - set new MTU size
@@ -302,21 +303,23 @@ static int eth_mac_addr(struct net_device *dev, void *p)
  * Allow changing MTU size. Needs to be overridden for devices
  * supporting jumbo frames.
  */
-static int eth_change_mtu(struct net_device *dev, int new_mtu)
+int eth_change_mtu(struct net_device *dev, int new_mtu)
 {
 	if (new_mtu < 68 || new_mtu > ETH_DATA_LEN)
 		return -EINVAL;
 	dev->mtu = new_mtu;
 	return 0;
 }
+EXPORT_SYMBOL(eth_change_mtu);
 
-static int eth_validate_addr(struct net_device *dev)
+int eth_validate_addr(struct net_device *dev)
 {
 	if (!is_valid_ether_addr(dev->dev_addr))
 		return -EADDRNOTAVAIL;
 
 	return 0;
 }
+EXPORT_SYMBOL(eth_validate_addr);
 
 const struct header_ops eth_header_ops ____cacheline_aligned = {
 	.create		= eth_header,
@@ -334,11 +337,11 @@ const struct header_ops eth_header_ops ____cacheline_aligned = {
 void ether_setup(struct net_device *dev)
 {
 	dev->header_ops		= &eth_header_ops;
-
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
 	dev->change_mtu		= eth_change_mtu;
 	dev->set_mac_address 	= eth_mac_addr;
 	dev->validate_addr	= eth_validate_addr;
-
+#endif
 	dev->type		= ARPHRD_ETHER;
 	dev->hard_header_len 	= ETH_HLEN;
 	dev->mtu		= ETH_DATA_LEN;
-- 
cgit v1.2.3-70-g09d2


From d214c7537bbf2f247991fb65b3420b0b3d712c67 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 20 Nov 2008 00:49:27 -0800
Subject: filter: add SKF_AD_NLATTR_NEST to look for nested attributes

SKF_AD_NLATTR allows us to find the first matching attribute in a
stream of netlink attributes from one offset to the end of the
netlink message. This is not suitable to look for a specific
matching inside a set of nested attributes.

For example, in ctnetlink messages, if we look for the CTA_V6_SRC
attribute in a message that talks about an IPv4 connection,
SKF_AD_NLATTR returns the offset of CTA_STATUS which has the same
value of CTA_V6_SRC but outside the nest. To differenciate
CTA_STATUS and CTA_V6_SRC, we would have to make assumptions on the
size of the attribute and the usual offset, resulting in horrible
BSF code.

This patch adds SKF_AD_NLATTR_NEST, which is a variant of
SKF_AD_NLATTR, that looks for an attribute inside the limits of
a nested attributes, but not further.

This patch validates that we have enough room to look for the
nested attributes - based on a suggestion from Patrick McHardy.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  3 ++-
 net/core/filter.c      | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b6ea9aa9e85..1354aaf6abb 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -122,7 +122,8 @@ struct sock_fprog	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_PKTTYPE 	4
 #define SKF_AD_IFINDEX 	8
 #define SKF_AD_NLATTR	12
-#define SKF_AD_MAX 	16
+#define SKF_AD_NLATTR_NEST	16
+#define SKF_AD_MAX	20
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index df374435583..d1d779ca096 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -319,6 +319,25 @@ load_b:
 				A = 0;
 			continue;
 		}
+		case SKF_AD_NLATTR_NEST: {
+			struct nlattr *nla;
+
+			if (skb_is_nonlinear(skb))
+				return 0;
+			if (A > skb->len - sizeof(struct nlattr))
+				return 0;
+
+			nla = (struct nlattr *)&skb->data[A];
+			if (nla->nla_len > A - skb->len)
+				return 0;
+
+			nla = nla_find_nested(nla, X);
+			if (nla)
+				A = (void *)nla - (void *)skb->data;
+			else
+				A = 0;
+			continue;
+		}
 		default:
 			return 0;
 		}
-- 
cgit v1.2.3-70-g09d2


From 0c19b0adb8dd33dbd10ff48e41971231c486855c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 20 Nov 2008 04:08:29 -0800
Subject: netlink: avoid memset of 0 bytes sparse warning

A netlink attribute padding of zero triggers this sparse warning:

include/linux/netlink.h:245:8: warning: memset with byte count of 0

Avoid the memset when the size parameter is constant and requires no padding.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 9ff1b54908f..51b09a1f46c 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -242,7 +242,8 @@ __nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags)
 	nlh->nlmsg_flags = flags;
 	nlh->nlmsg_pid = pid;
 	nlh->nlmsg_seq = seq;
-	memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size);
+	if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
+		memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size);
 	return nlh;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 13d2a1d2b032de08d7dcab6a1edcd47802681f96 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 20 Nov 2008 04:10:00 -0800
Subject: pkt_sched: add DRR scheduler

Add classful DRR scheduler as a more flexible replacement for SFQ.

The main difference to the algorithm described in "Efficient Fair Queueing
using Deficit Round Robin" is that this implementation doesn't drop packets
from the longest queue on overrun because its classful and limits are
handled by each individual child qdisc.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_sched.h |  16 ++
 net/sched/Kconfig         |  11 +
 net/sched/Makefile        |   1 +
 net/sched/sch_drr.c       | 505 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 533 insertions(+)
 create mode 100644 net/sched/sch_drr.c

(limited to 'include/linux')

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 5d921fa91a5..e3f133adba7 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -500,4 +500,20 @@ struct tc_netem_corrupt
 
 #define NETEM_DIST_SCALE	8192
 
+/* DRR */
+
+enum
+{
+	TCA_DRR_UNSPEC,
+	TCA_DRR_QUANTUM,
+	__TCA_DRR_MAX
+};
+
+#define TCA_DRR_MAX	(__TCA_DRR_MAX - 1)
+
+struct tc_drr_stats
+{
+	u32	deficit;
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 36543b6fcef..4f7ef0db302 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -194,6 +194,17 @@ config NET_SCH_NETEM
 
 	  If unsure, say N.
 
+config NET_SCH_DRR
+	tristate "Deficit Round Robin scheduler (DRR)"
+	help
+	  Say Y here if you want to use the Deficit Round Robin (DRR) packet
+	  scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_drr.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 70b35f8708c..54d950cd4b8 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
 obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
+obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
new file mode 100644
index 00000000000..e71a5de23e2
--- /dev/null
+++ b/net/sched/sch_drr.c
@@ -0,0 +1,505 @@
+/*
+ * net/sched/sch_drr.c         Deficit Round Robin scheduler
+ *
+ * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+struct drr_class {
+	struct Qdisc_class_common	common;
+	unsigned int			refcnt;
+	unsigned int			filter_cnt;
+
+	struct gnet_stats_basic		bstats;
+	struct gnet_stats_queue		qstats;
+	struct gnet_stats_rate_est	rate_est;
+	struct list_head		alist;
+	struct Qdisc			*qdisc;
+
+	u32				quantum;
+	u32				deficit;
+};
+
+struct drr_sched {
+	struct list_head		active;
+	struct tcf_proto		*filter_list;
+	struct Qdisc_class_hash		clhash;
+};
+
+static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct Qdisc_class_common *clc;
+
+	clc = qdisc_class_find(&q->clhash, classid);
+	if (clc == NULL)
+		return NULL;
+	return container_of(clc, struct drr_class, common);
+}
+
+static void drr_purge_queue(struct drr_class *cl)
+{
+	unsigned int len = cl->qdisc->q.qlen;
+
+	qdisc_reset(cl->qdisc);
+	qdisc_tree_decrease_qlen(cl->qdisc, len);
+}
+
+static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
+	[TCA_DRR_QUANTUM]	= { .type = NLA_U32 },
+};
+
+static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
+			    struct nlattr **tca, unsigned long *arg)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl = (struct drr_class *)*arg;
+	struct nlattr *tb[TCA_DRR_MAX + 1];
+	u32 quantum;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_DRR_MAX, tca[TCA_OPTIONS], drr_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_DRR_QUANTUM]) {
+		quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
+		if (quantum == 0)
+			return -EINVAL;
+	} else
+		quantum = psched_mtu(qdisc_dev(sch));
+
+	if (cl != NULL) {
+		sch_tree_lock(sch);
+		if (tb[TCA_DRR_QUANTUM])
+			cl->quantum = quantum;
+		sch_tree_unlock(sch);
+
+		if (tca[TCA_RATE])
+			gen_replace_estimator(&cl->bstats, &cl->rate_est,
+					      qdisc_root_sleeping_lock(sch),
+					      tca[TCA_RATE]);
+		return 0;
+	}
+
+	cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
+	if (cl == NULL)
+		return -ENOBUFS;
+
+	cl->refcnt	   = 1;
+	cl->common.classid = classid;
+	cl->quantum	   = quantum;
+	cl->qdisc	   = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+					       &pfifo_qdisc_ops, classid);
+	if (cl->qdisc == NULL)
+		cl->qdisc = &noop_qdisc;
+
+	if (tca[TCA_RATE])
+		gen_replace_estimator(&cl->bstats, &cl->rate_est,
+				      qdisc_root_sleeping_lock(sch),
+				      tca[TCA_RATE]);
+
+	sch_tree_lock(sch);
+	qdisc_class_hash_insert(&q->clhash, &cl->common);
+	sch_tree_unlock(sch);
+
+	qdisc_class_hash_grow(sch, &q->clhash);
+
+	*arg = (unsigned long)cl;
+	return 0;
+}
+
+static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
+{
+	gen_kill_estimator(&cl->bstats, &cl->rate_est);
+	qdisc_destroy(cl->qdisc);
+	kfree(cl);
+}
+
+static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	if (cl->filter_cnt > 0)
+		return -EBUSY;
+
+	sch_tree_lock(sch);
+
+	drr_purge_queue(cl);
+	qdisc_class_hash_remove(&q->clhash, &cl->common);
+
+	if (--cl->refcnt == 0)
+		drr_destroy_class(sch, cl);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static unsigned long drr_get_class(struct Qdisc *sch, u32 classid)
+{
+	struct drr_class *cl = drr_find_class(sch, classid);
+
+	if (cl != NULL)
+		cl->refcnt++;
+
+	return (unsigned long)cl;
+}
+
+static void drr_put_class(struct Qdisc *sch, unsigned long arg)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	if (--cl->refcnt == 0)
+		drr_destroy_class(sch, cl);
+}
+
+static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+
+	return &q->filter_list;
+}
+
+static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
+				  u32 classid)
+{
+	struct drr_class *cl = drr_find_class(sch, classid);
+
+	if (cl != NULL)
+		cl->filter_cnt++;
+
+	return (unsigned long)cl;
+}
+
+static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	cl->filter_cnt--;
+}
+
+static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
+			   struct Qdisc *new, struct Qdisc **old)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	if (new == NULL) {
+		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+					&pfifo_qdisc_ops, cl->common.classid);
+		if (new == NULL)
+			new = &noop_qdisc;
+	}
+
+	sch_tree_lock(sch);
+	drr_purge_queue(cl);
+	*old = xchg(&cl->qdisc, new);
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	return cl->qdisc;
+}
+
+static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+
+	if (cl->qdisc->q.qlen == 0)
+		list_del(&cl->alist);
+}
+
+static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+	struct nlattr *nest;
+
+	tcm->tcm_parent	= TC_H_ROOT;
+	tcm->tcm_handle	= cl->common.classid;
+	tcm->tcm_info	= cl->qdisc->handle;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (nest == NULL)
+		goto nla_put_failure;
+	NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum);
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
+				struct gnet_dump *d)
+{
+	struct drr_class *cl = (struct drr_class *)arg;
+	struct tc_drr_stats xstats;
+
+	memset(&xstats, 0, sizeof(xstats));
+	if (cl->qdisc->q.qlen)
+		xstats.deficit = cl->deficit;
+
+	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
+	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
+	    gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
+		return -1;
+
+	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+}
+
+static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct hlist_node *n;
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+			if (arg->count < arg->skip) {
+				arg->count++;
+				continue;
+			}
+			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+				arg->stop = 1;
+				return;
+			}
+			arg->count++;
+		}
+	}
+}
+
+static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct tcf_result res;
+	int result;
+
+	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
+		cl = drr_find_class(sch, skb->priority);
+		if (cl != NULL)
+			return cl;
+	}
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#endif
+		cl = (struct drr_class *)res.class;
+		if (cl == NULL)
+			cl = drr_find_class(sch, res.classid);
+		return cl;
+	}
+	return NULL;
+}
+
+static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	unsigned int len;
+	int err;
+
+	cl = drr_classify(skb, sch, &err);
+	if (cl == NULL) {
+		if (err & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return err;
+	}
+
+	len = qdisc_pkt_len(skb);
+	err = qdisc_enqueue(skb, cl->qdisc);
+	if (unlikely(err != NET_XMIT_SUCCESS)) {
+		if (net_xmit_drop_count(err)) {
+			cl->qstats.drops++;
+			sch->qstats.drops++;
+		}
+		return err;
+	}
+
+	if (cl->qdisc->q.qlen == 1) {
+		list_add_tail(&cl->alist, &q->active);
+		cl->deficit = cl->quantum;
+	}
+
+	cl->bstats.packets++;
+	cl->bstats.bytes += len;
+	sch->bstats.packets++;
+	sch->bstats.bytes += len;
+
+	sch->q.qlen++;
+	return err;
+}
+
+static struct sk_buff *drr_dequeue(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct sk_buff *skb;
+	unsigned int len;
+
+	while (!list_empty(&q->active)) {
+		cl = list_first_entry(&q->active, struct drr_class, alist);
+		skb = cl->qdisc->ops->peek(cl->qdisc);
+		if (skb == NULL)
+			goto skip;
+
+		len = qdisc_pkt_len(skb);
+		if (len <= cl->deficit) {
+			cl->deficit -= len;
+			skb = qdisc_dequeue_peeked(cl->qdisc);
+			if (cl->qdisc->q.qlen == 0)
+				list_del(&cl->alist);
+			sch->q.qlen--;
+			return skb;
+		}
+
+		cl->deficit += cl->quantum;
+skip:
+		list_move_tail(&cl->alist, &q->active);
+	}
+	return NULL;
+}
+
+static unsigned int drr_drop(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	unsigned int len;
+
+	list_for_each_entry(cl, &q->active, alist) {
+		if (cl->qdisc->ops->drop) {
+			len = cl->qdisc->ops->drop(cl->qdisc);
+			if (len > 0) {
+				if (cl->qdisc->q.qlen == 0)
+					list_del(&cl->alist);
+				return len;
+			}
+		}
+	}
+	return 0;
+}
+
+static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	int err;
+
+	err = qdisc_class_hash_init(&q->clhash);
+	if (err < 0)
+		return err;
+	INIT_LIST_HEAD(&q->active);
+	return 0;
+}
+
+static void drr_reset_qdisc(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct hlist_node *n;
+	unsigned int i;
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
+			if (cl->qdisc->q.qlen)
+				list_del(&cl->alist);
+			qdisc_reset(cl->qdisc);
+		}
+	}
+	sch->q.qlen = 0;
+}
+
+static void drr_destroy_qdisc(struct Qdisc *sch)
+{
+	struct drr_sched *q = qdisc_priv(sch);
+	struct drr_class *cl;
+	struct hlist_node *n, *next;
+	unsigned int i;
+
+	tcf_destroy_chain(&q->filter_list);
+
+	for (i = 0; i < q->clhash.hashsize; i++) {
+		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
+					  common.hnode)
+			drr_destroy_class(sch, cl);
+	}
+	qdisc_class_hash_destroy(&q->clhash);
+}
+
+static const struct Qdisc_class_ops drr_class_ops = {
+	.change		= drr_change_class,
+	.delete		= drr_delete_class,
+	.get		= drr_get_class,
+	.put		= drr_put_class,
+	.tcf_chain	= drr_tcf_chain,
+	.bind_tcf	= drr_bind_tcf,
+	.unbind_tcf	= drr_unbind_tcf,
+	.graft		= drr_graft_class,
+	.leaf		= drr_class_leaf,
+	.qlen_notify	= drr_qlen_notify,
+	.dump		= drr_dump_class,
+	.dump_stats	= drr_dump_class_stats,
+	.walk		= drr_walk,
+};
+
+static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
+	.cl_ops		= &drr_class_ops,
+	.id		= "drr",
+	.priv_size	= sizeof(struct drr_sched),
+	.enqueue	= drr_enqueue,
+	.dequeue	= drr_dequeue,
+	.peek		= qdisc_peek_dequeued,
+	.drop		= drr_drop,
+	.init		= drr_init_qdisc,
+	.reset		= drr_reset_qdisc,
+	.destroy	= drr_destroy_qdisc,
+	.owner		= THIS_MODULE,
+};
+
+static int __init drr_init(void)
+{
+	return register_qdisc(&drr_qdisc_ops);
+}
+
+static void __exit drr_exit(void)
+{
+	unregister_qdisc(&drr_qdisc_ops);
+}
+
+module_init(drr_init);
+module_exit(drr_exit);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 018a7bf1e55000dd792194238c9043918d24d3dd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Thu, 20 Nov 2008 15:59:56 +0100
Subject: netfilter: ip{,6}t_policy.h should include xp_policy.h

It seems that all of the include/netfilter_{ipv4,ipv6}/{ipt,ip6t}_*.h which
share constants include the corresponding include/netfilter/xp_*.h files.
Neither ipt_policy.h not ip6t_policy.h do.  Make these consistant with
the norm.

Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_ipv4/ipt_policy.h  | 2 ++
 include/linux/netfilter_ipv6/ip6t_policy.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h
index b9478a25530..1037fb2cd20 100644
--- a/include/linux/netfilter_ipv4/ipt_policy.h
+++ b/include/linux/netfilter_ipv4/ipt_policy.h
@@ -1,6 +1,8 @@
 #ifndef _IPT_POLICY_H
 #define _IPT_POLICY_H
 
+#include <linux/netfilter/xt_policy.h>
+
 #define IPT_POLICY_MAX_ELEM		XT_POLICY_MAX_ELEM
 
 /* ipt_policy_flags */
diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h
index 6bab3163d2f..b1c449d7ec8 100644
--- a/include/linux/netfilter_ipv6/ip6t_policy.h
+++ b/include/linux/netfilter_ipv6/ip6t_policy.h
@@ -1,6 +1,8 @@
 #ifndef _IP6T_POLICY_H
 #define _IP6T_POLICY_H
 
+#include <linux/netfilter/xt_policy.h>
+
 #define IP6T_POLICY_MAX_ELEM		XT_POLICY_MAX_ELEM
 
 /* ip6t_policy_flags */
-- 
cgit v1.2.3-70-g09d2


From 008298231abbeb91bc7be9e8b078607b816d1a4a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 20 Nov 2008 20:14:53 -0800
Subject: netdev: add more functions to netdevice ops

This patch moves neigh_setup and hard_start_xmit into the network device ops
structure. For bisection, fix all the previously converted drivers as well.
Bonding driver took the biggest hit on this.

Added a prefetch of the hard_start_xmit in the fast path to try and reduce
any impact this would have.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/8139cp.c            |  2 +-
 drivers/net/8139too.c           |  2 +-
 drivers/net/acenic.c            |  4 ++-
 drivers/net/atl1e/atl1e_main.c  |  3 ++-
 drivers/net/atlx/atl1.c         |  4 +--
 drivers/net/atlx/atl2.c         |  2 +-
 drivers/net/bonding/bond_main.c | 56 +++++++++++++++++++++++++++++++++--------
 drivers/net/chelsio/cxgb2.c     |  6 ++---
 drivers/net/cxgb3/cxgb3_main.c  |  1 -
 drivers/net/e100.c              |  2 +-
 drivers/net/e1000/e1000_main.c  |  2 +-
 drivers/net/e1000e/netdev.c     |  2 +-
 drivers/net/enic/enic_main.c    |  2 +-
 drivers/net/forcedeth.c         | 22 +++++++++++++---
 drivers/net/ifb.c               |  4 +--
 drivers/net/igb/igb_main.c      |  2 +-
 drivers/net/ixgb/ixgb_main.c    |  2 +-
 drivers/net/ixgbe/ixgbe_main.c  |  2 +-
 drivers/net/loopback.c          |  2 +-
 drivers/net/macvlan.c           |  4 +--
 drivers/net/niu.c               |  2 +-
 drivers/net/ppp_generic.c       |  5 ++--
 drivers/net/r8169.c             |  2 +-
 drivers/net/skge.c              |  2 +-
 drivers/net/sky2.c              |  3 ++-
 drivers/net/tg3.c               | 45 ++++++++++++++++++++++-----------
 drivers/net/tun.c               |  4 +--
 drivers/net/veth.c              |  2 +-
 drivers/net/via-velocity.c      |  2 +-
 include/linux/netdevice.h       | 39 ++++++++++++++++++----------
 net/bridge/br_device.c          | 10 ++++----
 net/bridge/br_if.c              |  2 +-
 net/core/dev.c                  | 12 ++++++---
 net/core/neighbour.c            |  6 ++---
 net/core/netpoll.c              |  6 +++--
 net/core/pktgen.c               |  8 +++---
 36 files changed, 183 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index 13f75b67872..f6d9d1353dd 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -1824,6 +1824,7 @@ static const struct net_device_ops cp_netdev_ops = {
 	.ndo_set_multicast_list	= cp_set_rx_mode,
 	.ndo_get_stats		= cp_get_stats,
 	.ndo_do_ioctl		= cp_ioctl,
+	.ndo_start_xmit		= cp_start_xmit,
 	.ndo_tx_timeout		= cp_tx_timeout,
 #if CP_VLAN_TAG_USED
 	.ndo_vlan_rx_register	= cp_vlan_rx_register,
@@ -1949,7 +1950,6 @@ static int cp_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 
 	dev->netdev_ops = &cp_netdev_ops;
-	dev->hard_start_xmit = cp_start_xmit;
 	netif_napi_add(dev, &cp->napi, cp_rx_poll, 16);
 	dev->ethtool_ops = &cp_ethtool_ops;
 	dev->watchdog_timeo = TX_TIMEOUT;
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index f8866552386..445a479db79 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -921,6 +921,7 @@ static const struct net_device_ops rtl8139_netdev_ops = {
 	.ndo_stop		= rtl8139_close,
 	.ndo_get_stats		= rtl8139_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_start_xmit		= rtl8139_start_xmit,
 	.ndo_set_multicast_list	= rtl8139_set_rx_mode,
 	.ndo_do_ioctl		= netdev_ioctl,
 	.ndo_tx_timeout		= rtl8139_tx_timeout,
@@ -992,7 +993,6 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 	dev->netdev_ops = &rtl8139_netdev_ops;
 	dev->ethtool_ops = &rtl8139_ethtool_ops;
 	dev->watchdog_timeo = TX_TIMEOUT;
-	dev->hard_start_xmit = rtl8139_start_xmit;
 	netif_napi_add(dev, &tp->napi, rtl8139_poll, 64);
 
 	/* note: the hardware is not capable of sg/csum/highdma, however
diff --git a/drivers/net/acenic.c b/drivers/net/acenic.c
index 309a90ea921..21d24320210 100644
--- a/drivers/net/acenic.c
+++ b/drivers/net/acenic.c
@@ -455,10 +455,13 @@ static const struct net_device_ops ace_netdev_ops = {
 	.ndo_stop		= ace_close,
 	.ndo_tx_timeout		= ace_watchdog,
 	.ndo_get_stats		= ace_get_stats,
+	.ndo_start_xmit		= ace_start_xmit,
 	.ndo_set_multicast_list	= ace_set_multicast_list,
 	.ndo_set_mac_address	= ace_set_mac_addr,
 	.ndo_change_mtu		= ace_change_mtu,
+#if ACENIC_DO_VLAN
 	.ndo_vlan_rx_register	= ace_vlan_rx_register,
+#endif
 };
 
 static int __devinit acenic_probe_one(struct pci_dev *pdev,
@@ -489,7 +492,6 @@ static int __devinit acenic_probe_one(struct pci_dev *pdev,
 	dev->watchdog_timeo = 5*HZ;
 
 	dev->netdev_ops = &ace_netdev_ops;
-	dev->hard_start_xmit = &ace_start_xmit;
 	SET_ETHTOOL_OPS(dev, &ace_ethtool_ops);
 
 	/* we only display this string ONCE */
diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c
index a815fffc2a5..98b2a7a466b 100644
--- a/drivers/net/atl1e/atl1e_main.c
+++ b/drivers/net/atl1e/atl1e_main.c
@@ -2256,6 +2256,7 @@ static void atl1e_shutdown(struct pci_dev *pdev)
 static const struct net_device_ops atl1e_netdev_ops = {
 	.ndo_open		= atl1e_open,
 	.ndo_stop		= atl1e_close,
+	.ndo_start_xmit		= atl1e_xmit_frame,
 	.ndo_get_stats		= atl1e_get_stats,
 	.ndo_set_multicast_list	= atl1e_set_multi,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -2277,7 +2278,7 @@ static int atl1e_init_netdev(struct net_device *netdev, struct pci_dev *pdev)
 
 	netdev->irq  = pdev->irq;
 	netdev->netdev_ops = &atl1e_netdev_ops;
-	netdev->hard_start_xmit = atl1e_xmit_frame,
+
 	netdev->watchdog_timeo = AT_TX_WATCHDOG;
 	atl1e_set_ethtool_ops(netdev);
 
diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c
index 7a0fb04e348..aef7e47fdd2 100644
--- a/drivers/net/atlx/atl1.c
+++ b/drivers/net/atlx/atl1.c
@@ -2883,12 +2883,13 @@ static void atl1_poll_controller(struct net_device *netdev)
 static const struct net_device_ops atl1_netdev_ops = {
 	.ndo_open		= atl1_open,
 	.ndo_stop		= atl1_close,
+	.ndo_start_xmit		= atl1_xmit_frame,
 	.ndo_set_multicast_list	= atlx_set_multi,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= atl1_set_mac,
 	.ndo_change_mtu		= atl1_change_mtu,
 	.ndo_do_ioctl		= atlx_ioctl,
-	.ndo_tx_timeout	= atlx_tx_timeout,
+	.ndo_tx_timeout		= atlx_tx_timeout,
 	.ndo_vlan_rx_register	= atlx_vlan_rx_register,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= atl1_poll_controller,
@@ -2983,7 +2984,6 @@ static int __devinit atl1_probe(struct pci_dev *pdev,
 	adapter->mii.reg_num_mask = 0x1f;
 
 	netdev->netdev_ops = &atl1_netdev_ops;
-	netdev->hard_start_xmit = &atl1_xmit_frame;
 	netdev->watchdog_timeo = 5 * HZ;
 
 	netdev->ethtool_ops = &atl1_ethtool_ops;
diff --git a/drivers/net/atlx/atl2.c b/drivers/net/atlx/atl2.c
index b8d585722e1..0326a84503e 100644
--- a/drivers/net/atlx/atl2.c
+++ b/drivers/net/atlx/atl2.c
@@ -1315,6 +1315,7 @@ static void atl2_poll_controller(struct net_device *netdev)
 static const struct net_device_ops atl2_netdev_ops = {
 	.ndo_open		= atl2_open,
 	.ndo_stop		= atl2_close,
+	.ndo_start_xmit		= atl2_xmit_frame,
 	.ndo_set_multicast_list	= atl2_set_multi,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= atl2_set_mac,
@@ -1400,7 +1401,6 @@ static int __devinit atl2_probe(struct pci_dev *pdev,
 
 	atl2_setup_pcicmd(pdev);
 
-	netdev->hard_start_xmit = &atl2_xmit_frame;
 	netdev->netdev_ops = &atl2_netdev_ops;
 	atl2_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 614656c8187..a339a805273 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1377,14 +1377,12 @@ done:
 	return 0;
 }
 
-
 static void bond_setup_by_slave(struct net_device *bond_dev,
 				struct net_device *slave_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	bond_dev->neigh_setup           = slave_dev->neigh_setup;
-	bond_dev->header_ops		= slave_dev->header_ops;
+	bond_dev->header_ops	    = slave_dev->header_ops;
 
 	bond_dev->type		    = slave_dev->type;
 	bond_dev->hard_header_len   = slave_dev->hard_header_len;
@@ -4124,6 +4122,20 @@ static void bond_set_multicast_list(struct net_device *bond_dev)
 	read_unlock(&bond->lock);
 }
 
+static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms)
+{
+	struct bonding *bond = netdev_priv(dev);
+	struct slave *slave = bond->first_slave;
+
+	if (slave) {
+		const struct net_device_ops *slave_ops
+			= slave->dev->netdev_ops;
+		if (slave_ops->ndo_neigh_setup)
+			return slave_ops->ndo_neigh_setup(dev, parms);
+	}
+	return 0;
+}
+
 /*
  * Change the MTU of all of a master's slaves to match the master
  */
@@ -4490,6 +4502,35 @@ static void bond_set_xmit_hash_policy(struct bonding *bond)
 	}
 }
 
+static int bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct bonding *bond = netdev_priv(dev);
+
+	switch (bond->params.mode) {
+	case BOND_MODE_ROUNDROBIN:
+		return bond_xmit_roundrobin(skb, dev);
+	case BOND_MODE_ACTIVEBACKUP:
+		return bond_xmit_activebackup(skb, dev);
+	case BOND_MODE_XOR:
+		return bond_xmit_xor(skb, dev);
+	case BOND_MODE_BROADCAST:
+		return bond_xmit_broadcast(skb, dev);
+	case BOND_MODE_8023AD:
+		return bond_3ad_xmit_xor(skb, dev);
+	case BOND_MODE_ALB:
+	case BOND_MODE_TLB:
+		return bond_alb_xmit(skb, dev);
+	default:
+		/* Should never happen, mode already checked */
+		printk(KERN_ERR DRV_NAME ": %s: Error: Unknown bonding mode %d\n",
+		     dev->name, bond->params.mode);
+		WARN_ON_ONCE(1);
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+}
+
+
 /*
  * set bond mode specific net device operations
  */
@@ -4499,28 +4540,22 @@ void bond_set_mode_ops(struct bonding *bond, int mode)
 
 	switch (mode) {
 	case BOND_MODE_ROUNDROBIN:
-		bond_dev->hard_start_xmit = bond_xmit_roundrobin;
 		break;
 	case BOND_MODE_ACTIVEBACKUP:
-		bond_dev->hard_start_xmit = bond_xmit_activebackup;
 		break;
 	case BOND_MODE_XOR:
-		bond_dev->hard_start_xmit = bond_xmit_xor;
 		bond_set_xmit_hash_policy(bond);
 		break;
 	case BOND_MODE_BROADCAST:
-		bond_dev->hard_start_xmit = bond_xmit_broadcast;
 		break;
 	case BOND_MODE_8023AD:
 		bond_set_master_3ad_flags(bond);
-		bond_dev->hard_start_xmit = bond_3ad_xmit_xor;
 		bond_set_xmit_hash_policy(bond);
 		break;
 	case BOND_MODE_ALB:
 		bond_set_master_alb_flags(bond);
 		/* FALLTHRU */
 	case BOND_MODE_TLB:
-		bond_dev->hard_start_xmit = bond_alb_xmit;
 		break;
 	default:
 		/* Should never happen, mode already checked */
@@ -4553,12 +4588,13 @@ static const struct ethtool_ops bond_ethtool_ops = {
 static const struct net_device_ops bond_netdev_ops = {
 	.ndo_open		= bond_open,
 	.ndo_stop		= bond_close,
+	.ndo_start_xmit		= bond_start_xmit,
 	.ndo_get_stats		= bond_get_stats,
 	.ndo_do_ioctl		= bond_do_ioctl,
 	.ndo_set_multicast_list	= bond_set_multicast_list,
 	.ndo_change_mtu		= bond_change_mtu,
-	.ndo_validate_addr	= NULL,
 	.ndo_set_mac_address 	= bond_set_mac_address,
+	.ndo_neigh_setup	= bond_neigh_setup,
 	.ndo_vlan_rx_register	= bond_vlan_rx_register,
 	.ndo_vlan_rx_add_vid 	= bond_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= bond_vlan_rx_kill_vid,
diff --git a/drivers/net/chelsio/cxgb2.c b/drivers/net/chelsio/cxgb2.c
index 482741797eb..9b6011e7678 100644
--- a/drivers/net/chelsio/cxgb2.c
+++ b/drivers/net/chelsio/cxgb2.c
@@ -915,7 +915,7 @@ static int t1_set_mac_addr(struct net_device *dev, void *p)
 }
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-static void vlan_rx_register(struct net_device *dev,
+static void t1_vlan_rx_register(struct net_device *dev,
 				   struct vlan_group *grp)
 {
 	struct adapter *adapter = dev->ml_priv;
@@ -1013,6 +1013,7 @@ void t1_fatal_err(struct adapter *adapter)
 static const struct net_device_ops cxgb_netdev_ops = {
 	.ndo_open		= cxgb_open,
 	.ndo_stop		= cxgb_close,
+	.ndo_start_xmit		= t1_start_xmit,
 	.ndo_get_stats		= t1_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_multicast_list	= t1_set_rxmode,
@@ -1020,7 +1021,7 @@ static const struct net_device_ops cxgb_netdev_ops = {
 	.ndo_change_mtu		= t1_change_mtu,
 	.ndo_set_mac_address	= t1_set_mac_addr,
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-	.ndo_vlan_rx_register	= vlan_rx_register,
+	.ndo_vlan_rx_register	= t1_vlan_rx_register,
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= t1_netpoll,
@@ -1157,7 +1158,6 @@ static int __devinit init_one(struct pci_dev *pdev,
 		}
 
 		netdev->netdev_ops = &cxgb_netdev_ops;
-		netdev->hard_start_xmit = t1_start_xmit;
 		netdev->hard_header_len += (adapter->flags & TSO_CAPABLE) ?
 			sizeof(struct cpl_tx_pkt_lso) : sizeof(struct cpl_tx_pkt);
 
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index a9479be53ec..cd9fcaca70f 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -2955,7 +2955,6 @@ static int __devinit init_one(struct pci_dev *pdev,
 
 		netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
 		netdev->netdev_ops = &cxgb_netdev_ops;
-		netdev->hard_start_xmit = t3_eth_xmit;
 		SET_ETHTOOL_OPS(netdev, &cxgb_ethtool_ops);
 	}
 
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 5894716de19..2001a63794f 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -2615,6 +2615,7 @@ static int e100_close(struct net_device *netdev)
 static const struct net_device_ops e100_netdev_ops = {
 	.ndo_open		= e100_open,
 	.ndo_stop		= e100_close,
+	.ndo_start_xmit		= e100_xmit_frame,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_multicast_list	= e100_set_multicast_list,
 	.ndo_set_mac_address	= e100_set_mac_address,
@@ -2640,7 +2641,6 @@ static int __devinit e100_probe(struct pci_dev *pdev,
 	}
 
 	netdev->netdev_ops = &e100_netdev_ops;
-	netdev->hard_start_xmit = e100_xmit_frame;
 	SET_ETHTOOL_OPS(netdev, &e100_ethtool_ops);
 	netdev->watchdog_timeo = E100_WATCHDOG_PERIOD;
 	strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index debbba390d4..5c098c9d584 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -891,6 +891,7 @@ static int e1000_is_need_ioport(struct pci_dev *pdev)
 static const struct net_device_ops e1000_netdev_ops = {
 	.ndo_open		= e1000_open,
 	.ndo_stop		= e1000_close,
+	.ndo_start_xmit		= e1000_xmit_frame,
 	.ndo_get_stats		= e1000_get_stats,
 	.ndo_set_rx_mode	= e1000_set_rx_mode,
 	.ndo_set_mac_address	= e1000_set_mac,
@@ -1001,7 +1002,6 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 	}
 
 	netdev->netdev_ops = &e1000_netdev_ops;
-	netdev->hard_start_xmit = &e1000_xmit_frame;
 	e1000_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
 	netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index ced839e4cae..cc0502bbb9f 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -4707,6 +4707,7 @@ static void e1000_eeprom_checks(struct e1000_adapter *adapter)
 static const struct net_device_ops e1000e_netdev_ops = {
 	.ndo_open		= e1000_open,
 	.ndo_stop		= e1000_close,
+	.ndo_start_xmit		= e1000_xmit_frame,
 	.ndo_get_stats		= e1000_get_stats,
 	.ndo_set_multicast_list	= e1000_set_multi,
 	.ndo_set_mac_address	= e1000_set_mac,
@@ -4822,7 +4823,6 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 
 	/* construct the net_device struct */
 	netdev->netdev_ops		= &e1000e_netdev_ops;
-	netdev->hard_start_xmit		= &e1000_xmit_frame;
 	e1000e_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo		= 5 * HZ;
 	netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index 40f8c88b166..1c409df735d 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -1593,6 +1593,7 @@ static void enic_iounmap(struct enic *enic)
 static const struct net_device_ops enic_netdev_ops = {
 	.ndo_open		= enic_open,
 	.ndo_stop		= enic_stop,
+	.ndo_start_xmit		= enic_hard_start_xmit,
 	.ndo_get_stats		= enic_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_multicast_list	= enic_set_multicast_list,
@@ -1830,7 +1831,6 @@ static int __devinit enic_probe(struct pci_dev *pdev,
 	}
 
 	netdev->netdev_ops = &enic_netdev_ops;
-	netdev->hard_start_xmit = enic_hard_start_xmit;
 	netdev->watchdog_timeo = 2 * HZ;
 	netdev->ethtool_ops = &enic_ethtool_ops;
 
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index dd2e1f670b0..0d7e5750245 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -5412,6 +5412,23 @@ static const struct net_device_ops nv_netdev_ops = {
 	.ndo_open		= nv_open,
 	.ndo_stop		= nv_close,
 	.ndo_get_stats		= nv_get_stats,
+	.ndo_start_xmit		= nv_start_xmit,
+	.ndo_tx_timeout		= nv_tx_timeout,
+	.ndo_change_mtu		= nv_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_mac_address	= nv_set_mac_address,
+	.ndo_set_multicast_list	= nv_set_multicast,
+	.ndo_vlan_rx_register	= nv_vlan_rx_register,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= nv_poll_controller,
+#endif
+};
+
+static const struct net_device_ops nv_netdev_ops_optimized = {
+	.ndo_open		= nv_open,
+	.ndo_stop		= nv_close,
+	.ndo_get_stats		= nv_get_stats,
+	.ndo_start_xmit		= nv_start_xmit_optimized,
 	.ndo_tx_timeout		= nv_tx_timeout,
 	.ndo_change_mtu		= nv_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -5592,11 +5609,10 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 		goto out_freering;
 
 	if (!nv_optimized(np))
-		dev->hard_start_xmit = nv_start_xmit;
+		dev->netdev_ops = &nv_netdev_ops;
 	else
-		dev->hard_start_xmit = nv_start_xmit_optimized;
+		dev->netdev_ops = &nv_netdev_ops_optimized;
 
-	dev->netdev_ops = &nv_netdev_ops;
 #ifdef CONFIG_FORCEDETH_NAPI
 	netif_napi_add(dev, &np->napi, nv_napi_poll, RX_WORK_PER_LOOP);
 #endif
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 363a166df8f..60a26300193 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -138,15 +138,15 @@ resched:
 }
 
 static const struct net_device_ops ifb_netdev_ops = {
-	.ndo_validate_addr = eth_validate_addr,
 	.ndo_open	= ifb_open,
 	.ndo_stop	= ifb_close,
+	.ndo_start_xmit	= ifb_xmit,
+	.ndo_validate_addr = eth_validate_addr,
 };
 
 static void ifb_setup(struct net_device *dev)
 {
 	/* Initialize the device structure. */
-	dev->hard_start_xmit = ifb_xmit;
 	dev->destructor = free_netdev;
 	dev->netdev_ops = &ifb_netdev_ops;
 
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index ceb0a045879..eca5684d565 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -953,6 +953,7 @@ static int igb_is_need_ioport(struct pci_dev *pdev)
 static const struct net_device_ops igb_netdev_ops = {
 	.ndo_open 		= igb_open,
 	.ndo_stop		= igb_close,
+	.ndo_start_xmit		= igb_xmit_frame_adv,
 	.ndo_get_stats		= igb_get_stats,
 	.ndo_set_multicast_list	= igb_set_multi,
 	.ndo_set_mac_address	= igb_set_mac,
@@ -1080,7 +1081,6 @@ static int __devinit igb_probe(struct pci_dev *pdev,
 	netdev->netdev_ops = &igb_netdev_ops;
 	igb_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
-	netdev->hard_start_xmit = &igb_xmit_frame_adv;
 
 	strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
 
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
index 3ca9daa70b3..a04e3892ddf 100644
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -324,6 +324,7 @@ ixgb_reset(struct ixgb_adapter *adapter)
 static const struct net_device_ops ixgb_netdev_ops = {
 	.ndo_open 		= ixgb_open,
 	.ndo_stop		= ixgb_close,
+	.ndo_start_xmit		= ixgb_xmit_frame,
 	.ndo_get_stats		= ixgb_get_stats,
 	.ndo_set_multicast_list	= ixgb_set_multi,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -414,7 +415,6 @@ ixgb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	netdev->netdev_ops = &ixgb_netdev_ops;
-	netdev->hard_start_xmit = &ixgb_xmit_frame;
 	ixgb_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
 	netif_napi_add(netdev, &adapter->napi, ixgb_clean, 64);
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 7ad07a00680..40108523377 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -3728,6 +3728,7 @@ static int ixgbe_link_config(struct ixgbe_hw *hw)
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open 		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
+	.ndo_start_xmit		= ixgbe_xmit_frame,
 	.ndo_get_stats		= ixgbe_get_stats,
 	.ndo_set_multicast_list	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -3824,7 +3825,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	}
 
 	netdev->netdev_ops = &ixgbe_netdev_ops;
-	netdev->hard_start_xmit = &ixgbe_xmit_frame;
 	ixgbe_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
 	strcpy(netdev->name, pci_name(pdev));
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 958450124de..b7d438a367f 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -145,6 +145,7 @@ static void loopback_dev_free(struct net_device *dev)
 
 static const struct net_device_ops loopback_ops = {
 	.ndo_init      = loopback_dev_init,
+	.ndo_start_xmit= loopback_xmit,
 	.ndo_get_stats = loopback_get_stats,
 };
 
@@ -155,7 +156,6 @@ static const struct net_device_ops loopback_ops = {
 static void loopback_setup(struct net_device *dev)
 {
 	dev->mtu		= (16 * 1024) + 20 + 20 + 12;
-	dev->hard_start_xmit	= loopback_xmit;
 	dev->hard_header_len	= ETH_HLEN;	/* 14	*/
 	dev->addr_len		= ETH_ALEN;	/* 6	*/
 	dev->tx_queue_len	= 0;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d00ea444e0a..e8879217a1d 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -140,7 +140,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
 	return NULL;
 }
 
-static int macvlan_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static int macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	const struct macvlan_dev *vlan = netdev_priv(dev);
 	unsigned int len = skb->len;
@@ -365,6 +365,7 @@ static const struct net_device_ops macvlan_netdev_ops = {
 	.ndo_init		= macvlan_init,
 	.ndo_open		= macvlan_open,
 	.ndo_stop		= macvlan_stop,
+	.ndo_start_xmit		= macvlan_start_xmit,
 	.ndo_change_mtu		= macvlan_change_mtu,
 	.ndo_change_rx_flags	= macvlan_change_rx_flags,
 	.ndo_set_mac_address	= macvlan_set_mac_address,
@@ -377,7 +378,6 @@ static void macvlan_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->netdev_ops		= &macvlan_netdev_ops;
-	dev->hard_start_xmit	= macvlan_hard_start_xmit;
 	dev->destructor		= free_netdev;
 	dev->header_ops		= &macvlan_hard_header_ops,
 	dev->ethtool_ops	= &macvlan_ethtool_ops;
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index 318537efd58..a8d10630f80 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -8892,6 +8892,7 @@ static struct net_device * __devinit niu_alloc_and_init(
 static const struct net_device_ops niu_netdev_ops = {
 	.ndo_open		= niu_open,
 	.ndo_stop		= niu_close,
+	.ndo_start_xmit		= niu_start_xmit,
 	.ndo_get_stats		= niu_get_stats,
 	.ndo_set_multicast_list	= niu_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -8904,7 +8905,6 @@ static const struct net_device_ops niu_netdev_ops = {
 static void __devinit niu_assign_netdev_ops(struct net_device *dev)
 {
 	dev->netdev_ops = &niu_netdev_ops;
-	dev->hard_start_xmit = niu_start_xmit;
 	dev->ethtool_ops = &niu_ethtool_ops;
 	dev->watchdog_timeo = NIU_TX_TIMEOUT;
 }
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index bad99e8cac3..1b15a088a3b 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -972,7 +972,8 @@ ppp_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 }
 
 static const struct net_device_ops ppp_netdev_ops = {
-	.ndo_do_ioctl = ppp_net_ioctl,
+	.ndo_start_xmit = ppp_start_xmit,
+	.ndo_do_ioctl   = ppp_net_ioctl,
 };
 
 static void ppp_setup(struct net_device *dev)
@@ -2437,8 +2438,6 @@ ppp_create_interface(int unit, int *retp)
 	skb_queue_head_init(&ppp->mrq);
 #endif /* CONFIG_PPP_MULTILINK */
 
-	dev->hard_start_xmit = ppp_start_xmit;
-
 	ret = -EEXIST;
 	mutex_lock(&all_ppp_mutex);
 	if (unit < 0)
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index bac58ca628d..dddf6aeff49 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1927,6 +1927,7 @@ static const struct net_device_ops rtl8169_netdev_ops = {
 	.ndo_open		= rtl8169_open,
 	.ndo_stop		= rtl8169_close,
 	.ndo_get_stats		= rtl8169_get_stats,
+	.ndo_start_xmit		= rtl8169_start_xmit,
 	.ndo_tx_timeout		= rtl8169_tx_timeout,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= rtl8169_change_mtu,
@@ -2125,7 +2126,6 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		dev->dev_addr[i] = RTL_R8(MAC0 + i);
 	memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 
-	dev->hard_start_xmit = rtl8169_start_xmit;
 	SET_ETHTOOL_OPS(dev, &rtl8169_ethtool_ops);
 	dev->watchdog_timeo = RTL8169_TX_TIMEOUT;
 	dev->irq = pdev->irq;
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 93c1b1d9296..f73ee797400 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -3805,6 +3805,7 @@ static __exit void skge_debug_cleanup(void)
 static const struct net_device_ops skge_netdev_ops = {
 	.ndo_open		= skge_up,
 	.ndo_stop		= skge_down,
+	.ndo_start_xmit		= skge_xmit_frame,
 	.ndo_do_ioctl		= skge_ioctl,
 	.ndo_get_stats		= skge_get_stats,
 	.ndo_tx_timeout		= skge_tx_timeout,
@@ -3831,7 +3832,6 @@ static struct net_device *skge_devinit(struct skge_hw *hw, int port,
 	}
 
 	SET_NETDEV_DEV(dev, &hw->pdev->dev);
-	dev->hard_start_xmit = skge_xmit_frame;
 	dev->netdev_ops = &skge_netdev_ops;
 	dev->ethtool_ops = &skge_ethtool_ops;
 	dev->watchdog_timeo = TX_WATCHDOG;
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 251505125cb..3668e81e474 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -4047,6 +4047,7 @@ static const struct net_device_ops sky2_netdev_ops[2] = {
   {
 	.ndo_open		= sky2_up,
 	.ndo_stop		= sky2_down,
+	.ndo_start_xmit		= sky2_xmit_frame,
 	.ndo_do_ioctl		= sky2_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= sky2_set_mac_address,
@@ -4063,6 +4064,7 @@ static const struct net_device_ops sky2_netdev_ops[2] = {
   {
 	.ndo_open		= sky2_up,
 	.ndo_stop		= sky2_down,
+	.ndo_start_xmit		= sky2_xmit_frame,
 	.ndo_do_ioctl		= sky2_ioctl,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= sky2_set_mac_address,
@@ -4090,7 +4092,6 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw,
 
 	SET_NETDEV_DEV(dev, &hw->pdev->dev);
 	dev->irq = hw->pdev->irq;
-	dev->hard_start_xmit = sky2_xmit_frame;
 	SET_ETHTOOL_OPS(dev, &sky2_ethtool_ops);
 	dev->watchdog_timeo = TX_WATCHDOG;
 	dev->netdev_ops = &sky2_netdev_ops[port];
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 4b97cb60136..9ba18e1bc34 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -12614,19 +12614,6 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	else
 		tp->tg3_flags &= ~TG3_FLAG_POLL_SERDES;
 
-	/* All chips before 5787 can get confused if TX buffers
-	 * straddle the 4GB address boundary in some cases.
-	 */
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
-	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787 ||
-	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5784 ||
-	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5761 ||
-	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5785 ||
-	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906)
-		tp->dev->hard_start_xmit = tg3_start_xmit;
-	else
-		tp->dev->hard_start_xmit = tg3_start_xmit_dma_bug;
-
 	tp->rx_offset = 2;
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5701 &&
 	    (tp->tg3_flags & TG3_FLAG_PCIX_MODE) != 0)
@@ -13346,6 +13333,26 @@ static void __devinit tg3_init_coal(struct tg3 *tp)
 static const struct net_device_ops tg3_netdev_ops = {
 	.ndo_open		= tg3_open,
 	.ndo_stop		= tg3_close,
+	.ndo_start_xmit		= tg3_start_xmit,
+	.ndo_get_stats		= tg3_get_stats,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_set_multicast_list	= tg3_set_rx_mode,
+	.ndo_set_mac_address	= tg3_set_mac_addr,
+	.ndo_do_ioctl		= tg3_ioctl,
+	.ndo_tx_timeout		= tg3_tx_timeout,
+	.ndo_change_mtu		= tg3_change_mtu,
+#if TG3_VLAN_TAG_USED
+	.ndo_vlan_rx_register	= tg3_vlan_rx_register,
+#endif
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= tg3_poll_controller,
+#endif
+};
+
+static const struct net_device_ops tg3_netdev_ops_dma_bug = {
+	.ndo_open		= tg3_open,
+	.ndo_stop		= tg3_close,
+	.ndo_start_xmit		= tg3_start_xmit_dma_bug,
 	.ndo_get_stats		= tg3_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_multicast_list	= tg3_set_rx_mode,
@@ -13475,7 +13482,6 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 	tp->rx_jumbo_pending = TG3_DEF_RX_JUMBO_RING_PENDING;
 	tp->tx_pending = TG3_DEF_TX_RING_PENDING;
 
-	dev->netdev_ops = &tg3_netdev_ops;
 	netif_napi_add(dev, &tp->napi, tg3_poll, 64);
 	dev->ethtool_ops = &tg3_ethtool_ops;
 	dev->watchdog_timeo = TG3_TX_TIMEOUT;
@@ -13488,6 +13494,17 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 		goto err_out_iounmap;
 	}
 
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5755 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5787 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5784 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5761 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5785 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906)
+		dev->netdev_ops = &tg3_netdev_ops;
+	else
+		dev->netdev_ops = &tg3_netdev_ops_dma_bug;
+
+
 	/* The EPB bridge inside 5714, 5715, and 5780 and any
 	 * device behind the EPB cannot support DMA addresses > 40-bit.
 	 * On 64-bit systems with IOMMU, use 40-bit dma_mask.
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b4c94144475..fd0b11ea556 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -308,13 +308,14 @@ tun_net_change_mtu(struct net_device *dev, int new_mtu)
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_open		= tun_net_open,
 	.ndo_stop		= tun_net_close,
+	.ndo_start_xmit		= tun_net_xmit,
 	.ndo_change_mtu		= tun_net_change_mtu,
-
 };
 
 static const struct net_device_ops tap_netdev_ops = {
 	.ndo_open		= tun_net_open,
 	.ndo_stop		= tun_net_close,
+	.ndo_start_xmit		= tun_net_xmit,
 	.ndo_change_mtu		= tun_net_change_mtu,
 	.ndo_set_multicast_list	= tun_net_mclist,
 	.ndo_set_mac_address	= eth_mac_addr,
@@ -691,7 +692,6 @@ static void tun_setup(struct net_device *dev)
 	tun->owner = -1;
 	tun->group = -1;
 
-	dev->hard_start_xmit = tun_net_xmit;
 	dev->ethtool_ops = &tun_ethtool_ops;
 	dev->destructor = free_netdev;
 	dev->features |= NETIF_F_NETNS_LOCAL;
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 4f93a55aaaa..852d0e7c4e6 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -265,6 +265,7 @@ static void veth_dev_free(struct net_device *dev)
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init	= veth_dev_init,
 	.ndo_open	= veth_open,
+	.ndo_start_xmit = veth_xmit,
 	.ndo_get_stats	= veth_get_stats,
 };
 
@@ -273,7 +274,6 @@ static void veth_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->netdev_ops = &veth_netdev_ops;
-	dev->hard_start_xmit = veth_xmit;
 	dev->ethtool_ops = &veth_ethtool_ops;
 	dev->features |= NETIF_F_LLTX;
 	dev->destructor = veth_dev_free;
diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index 033e63a6843..58e25d090ae 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -852,6 +852,7 @@ static int velocity_soft_reset(struct velocity_info *vptr)
 static const struct net_device_ops velocity_netdev_ops = {
 	.ndo_open		= velocity_open,
 	.ndo_stop		= velocity_close,
+	.ndo_start_xmit		= velocity_xmit,
 	.ndo_get_stats		= velocity_get_stats,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_multicast_list	= velocity_set_multi,
@@ -971,7 +972,6 @@ static int __devinit velocity_found1(struct pci_dev *pdev, const struct pci_devi
 	vptr->phy_id = MII_GET_PHY_ID(vptr->mac_regs);
 
 	dev->irq = pdev->irq;
-	dev->hard_start_xmit = velocity_xmit;
 	dev->netdev_ops = &velocity_netdev_ops;
 	dev->ethtool_ops = &velocity_ethtool_ops;
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 981a089d514..d8fb23679ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -454,8 +454,8 @@ struct netdev_queue {
 
 /*
  * This structure defines the management hooks for network devices.
- * The following hooks can bed defined and are optonal (can be null)
- * unless otherwise noted.
+ * The following hooks can be defined; unless noted otherwise, they are
+ * optional and can be filled with a null pointer.
  *
  * int (*ndo_init)(struct net_device *dev);
  *     This function is called once when network device is registered.
@@ -475,6 +475,15 @@ struct netdev_queue {
  *     This function is called when network device transistions to the down
  *     state.
  *
+ * int (*ndo_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev);
+ *	Called when a packet needs to be transmitted.
+ *	Must return NETDEV_TX_OK , NETDEV_TX_BUSY, or NETDEV_TX_LOCKED,
+ *	Required can not be NULL.
+ *
+ * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb);
+ *	Called to decide which queue to when device supports multiple
+ *	transmit queues.
+ *
  * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
  *	This function is called to allow device receiver to make
  *	changes to configuration when multicast or promiscious is enabled.
@@ -508,7 +517,7 @@ struct netdev_queue {
  *	of a device. If not defined, any request to change MTU will
  *	will return an error.
  *
- * void (*ndo_tx_timeout) (struct net_device *dev);
+ * void (*ndo_tx_timeout)(struct net_device *dev);
  *	Callback uses when the transmitter has not made any progress
  *	for dev->watchdog ticks.
  *
@@ -538,6 +547,10 @@ struct net_device_ops {
 	void			(*ndo_uninit)(struct net_device *dev);
 	int			(*ndo_open)(struct net_device *dev);
 	int			(*ndo_stop)(struct net_device *dev);
+	int			(*ndo_start_xmit) (struct sk_buff *skb,
+						   struct net_device *dev);
+	u16			(*ndo_select_queue)(struct net_device *dev,
+						    struct sk_buff *skb);
 #define HAVE_CHANGE_RX_FLAGS
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
@@ -557,8 +570,10 @@ struct net_device_ops {
 	int			(*ndo_set_config)(struct net_device *dev,
 					          struct ifmap *map);
 #define HAVE_CHANGE_MTU
-	int			(*ndo_change_mtu)(struct net_device *dev, int new_mtu);
-
+	int			(*ndo_change_mtu)(struct net_device *dev,
+						  int new_mtu);
+	int			(*ndo_neigh_setup)(struct net_device *dev,
+						   struct neigh_parms *);
 #define HAVE_TX_TIMEOUT
 	void			(*ndo_tx_timeout) (struct net_device *dev);
 
@@ -761,18 +776,12 @@ struct net_device
 	/* Number of TX queues currently active in device  */
 	unsigned int		real_num_tx_queues;
 
-	/* Map buffer to appropriate transmit queue */
-	u16			(*select_queue)(struct net_device *dev,
-						struct sk_buff *skb);
-
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
 	spinlock_t		tx_global_lock;
 /*
  * One part is mostly used on xmit path (device)
  */
 	void			*priv;	/* pointer to private data	*/
-	int			(*hard_start_xmit) (struct sk_buff *skb,
-						    struct net_device *dev);
 	/* These may be needed for future network-power-down code. */
 	unsigned long		trans_start;	/* Time (in jiffies) of last Tx	*/
 
@@ -800,8 +809,6 @@ struct net_device
 	/* Called from unregister, can be used to call free_netdev */
 	void (*destructor)(struct net_device *dev);
 
-	int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
-
 #ifdef CONFIG_NETPOLL
 	struct netpoll_info	*npinfo;
 #endif
@@ -842,6 +849,10 @@ struct net_device
 		void			(*uninit)(struct net_device *dev);
 		int			(*open)(struct net_device *dev);
 		int			(*stop)(struct net_device *dev);
+		int			(*hard_start_xmit) (struct sk_buff *skb,
+							    struct net_device *dev);
+		u16			(*select_queue)(struct net_device *dev,
+							struct sk_buff *skb);
 		void			(*change_rx_flags)(struct net_device *dev,
 							   int flags);
 		void			(*set_rx_mode)(struct net_device *dev);
@@ -854,6 +865,8 @@ struct net_device
 		int			(*set_config)(struct net_device *dev,
 						      struct ifmap *map);
 		int			(*change_mtu)(struct net_device *dev, int new_mtu);
+		int			(*neigh_setup)(struct net_device *dev,
+						       struct neigh_parms *);
 		void			(*tx_timeout) (struct net_device *dev);
 		struct net_device_stats* (*get_stats)(struct net_device *dev);
 		void			(*vlan_rx_register)(struct net_device *dev,
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 920ce334839..18538d7460d 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -163,10 +163,11 @@ static const struct ethtool_ops br_ethtool_ops = {
 static const struct net_device_ops br_netdev_ops = {
 	.ndo_open		 = br_dev_open,
 	.ndo_stop		 = br_dev_stop,
-	.ndo_set_mac_address = br_set_mac_address,
-	.ndo_set_multicast_list = br_dev_set_multicast_list,
-	.ndo_change_mtu	 = br_change_mtu,
-	.ndo_do_ioctl	= br_dev_ioctl,
+	.ndo_start_xmit		 = br_dev_xmit,
+	.ndo_set_mac_address	 = br_set_mac_address,
+	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
+	.ndo_change_mtu		 = br_change_mtu,
+	.ndo_do_ioctl		 = br_dev_ioctl,
 };
 
 void br_dev_setup(struct net_device *dev)
@@ -175,7 +176,6 @@ void br_dev_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->netdev_ops = &br_netdev_ops;
-	dev->hard_start_xmit = br_dev_xmit;
 	dev->destructor = free_netdev;
 	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
 	dev->tx_queue_len = 0;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ee3a8dd13f5..727c5c510a6 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -373,7 +373,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER)
 		return -EINVAL;
 
-	if (dev->hard_start_xmit == br_dev_xmit)
+	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
 		return -ELOOP;
 
 	if (dev->br_port != NULL)
diff --git a/net/core/dev.c b/net/core/dev.c
index 8843f4e3f5e..4615e9a443a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1660,6 +1660,9 @@ static int dev_gso_segment(struct sk_buff *skb)
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			struct netdev_queue *txq)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	prefetch(&dev->netdev_ops->ndo_start_xmit);
 	if (likely(!skb->next)) {
 		if (!list_empty(&ptype_all))
 			dev_queue_xmit_nit(skb, dev);
@@ -1671,7 +1674,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				goto gso;
 		}
 
-		return dev->hard_start_xmit(skb, dev);
+		return ops->ndo_start_xmit(skb, dev);
 	}
 
 gso:
@@ -1681,7 +1684,7 @@ gso:
 
 		skb->next = nskb->next;
 		nskb->next = NULL;
-		rc = dev->hard_start_xmit(nskb, dev);
+		rc = ops->ndo_start_xmit(nskb, dev);
 		if (unlikely(rc)) {
 			nskb->next = skb->next;
 			skb->next = nskb;
@@ -1755,10 +1758,11 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 					struct sk_buff *skb)
 {
+	const struct net_device_ops *ops = dev->netdev_ops;
 	u16 queue_index = 0;
 
-	if (dev->select_queue)
-		queue_index = dev->select_queue(dev, skb);
+	if (ops->ndo_select_queue)
+		queue_index = ops->ndo_select_queue(dev, skb);
 	else if (dev->real_num_tx_queues > 1)
 		queue_index = simple_tx_hash(dev, skb);
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cca6a55909e..9c3717a23cf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1327,9 +1327,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 				      struct neigh_table *tbl)
 {
 	struct neigh_parms *p, *ref;
-	struct net *net;
+	struct net *net = dev_net(dev);
+	const struct net_device_ops *ops = dev->netdev_ops;
 
-	net = dev_net(dev);
 	ref = lookup_neigh_params(tbl, net, 0);
 	if (!ref)
 		return NULL;
@@ -1341,7 +1341,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
 
-		if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
+		if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
 			kfree(p);
 			return NULL;
 		}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 630df603444..96fb0519eb7 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -58,6 +58,7 @@ static void queue_process(struct work_struct *work)
 
 	while ((skb = skb_dequeue(&npinfo->txq))) {
 		struct net_device *dev = skb->dev;
+		const struct net_device_ops *ops = dev->netdev_ops;
 		struct netdev_queue *txq;
 
 		if (!netif_device_present(dev) || !netif_running(dev)) {
@@ -71,7 +72,7 @@ static void queue_process(struct work_struct *work)
 		__netif_tx_lock(txq, smp_processor_id());
 		if (netif_tx_queue_stopped(txq) ||
 		    netif_tx_queue_frozen(txq) ||
-		    dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
 			skb_queue_head(&npinfo->txq, skb);
 			__netif_tx_unlock(txq);
 			local_irq_restore(flags);
@@ -273,6 +274,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 	int status = NETDEV_TX_BUSY;
 	unsigned long tries;
 	struct net_device *dev = np->dev;
+	const struct net_device_ops *ops = dev->netdev_ops;
 	struct netpoll_info *npinfo = np->dev->npinfo;
 
 	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
@@ -293,7 +295,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		     tries > 0; --tries) {
 			if (__netif_tx_trylock(txq)) {
 				if (!netif_tx_queue_stopped(txq))
-					status = dev->hard_start_xmit(skb, dev);
+					status = ops->ndo_start_xmit(skb, dev);
 				__netif_tx_unlock(txq);
 
 				if (status == NETDEV_TX_OK)
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4e77914c4d4..15e0c2c7aac 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3352,14 +3352,14 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
 
 static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 {
-	struct net_device *odev = NULL;
+	struct net_device *odev = pkt_dev->odev;
+	int (*xmit)(struct sk_buff *, struct net_device *)
+		= odev->netdev_ops->ndo_start_xmit;
 	struct netdev_queue *txq;
 	__u64 idle_start = 0;
 	u16 queue_map;
 	int ret;
 
-	odev = pkt_dev->odev;
-
 	if (pkt_dev->delay_us || pkt_dev->delay_ns) {
 		u64 now;
 
@@ -3440,7 +3440,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
 
 		atomic_inc(&(pkt_dev->skb->users));
 	      retry_now:
-		ret = odev->hard_start_xmit(pkt_dev->skb, odev);
+		ret = (*xmit)(pkt_dev->skb, odev);
 		if (likely(ret == NETDEV_TX_OK)) {
 			pkt_dev->last_ok = 1;
 			pkt_dev->sofar++;
-- 
cgit v1.2.3-70-g09d2


From 145186a39570244aead77dc2efc559e5cac90548 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 20 Nov 2008 20:29:48 -0800
Subject: fddi: convert to new network device ops

Similar to ethernet. Convert infrastructure and the one lone FDDI
driver (for the one lone user of that hardware??). Compile tested only.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/skfp/skfddi.c  | 19 ++++++++++++-------
 include/linux/fddidevice.h |  1 +
 net/802/fddi.c             |  8 ++++++--
 3 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c
index 282bb47decc..7fbd4f84f27 100644
--- a/drivers/net/skfp/skfddi.c
+++ b/drivers/net/skfp/skfddi.c
@@ -168,6 +168,17 @@ static int num_boards;	/* total number of adapters configured */
 #define PRINTK(s, args...)
 #endif				// DRIVERDEBUG
 
+static const struct net_device_ops skfp_netdev_ops = {
+	.ndo_open		= skfp_open,
+	.ndo_stop		= skfp_close,
+	.ndo_start_xmit		= skfp_send_pkt,
+	.ndo_get_stats		= skfp_ctl_get_stats,
+	.ndo_change_mtu		= fddi_change_mtu,
+	.ndo_set_multicast_list = skfp_ctl_set_multicast_list,
+	.ndo_set_mac_address	= skfp_ctl_set_mac_address,
+	.ndo_do_ioctl		= skfp_ioctl,
+};
+
 /*
  * =================
  * = skfp_init_one =
@@ -253,13 +264,7 @@ static int skfp_init_one(struct pci_dev *pdev,
 	}
 
 	dev->irq = pdev->irq;
-	dev->get_stats = &skfp_ctl_get_stats;
-	dev->open = &skfp_open;
-	dev->stop = &skfp_close;
-	dev->hard_start_xmit = &skfp_send_pkt;
-	dev->set_multicast_list = &skfp_ctl_set_multicast_list;
-	dev->set_mac_address = &skfp_ctl_set_mac_address;
-	dev->do_ioctl = &skfp_ioctl;
+	dev->netdev_ops = &skfp_netdev_ops;
 
 	SET_NETDEV_DEV(dev, &pdev->dev);
 
diff --git a/include/linux/fddidevice.h b/include/linux/fddidevice.h
index e61e42dfd31..155bafd9e88 100644
--- a/include/linux/fddidevice.h
+++ b/include/linux/fddidevice.h
@@ -27,6 +27,7 @@
 #ifdef __KERNEL__
 extern __be16	fddi_type_trans(struct sk_buff *skb,
 				struct net_device *dev);
+extern int fddi_change_mtu(struct net_device *dev, int new_mtu);
 extern struct net_device *alloc_fddidev(int sizeof_priv);
 #endif
 
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 0549317b935..f1611a1e06a 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -167,23 +167,27 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 EXPORT_SYMBOL(fddi_type_trans);
 
-static int fddi_change_mtu(struct net_device *dev, int new_mtu)
+int fddi_change_mtu(struct net_device *dev, int new_mtu)
 {
 	if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
 		return(-EINVAL);
 	dev->mtu = new_mtu;
 	return(0);
 }
+EXPORT_SYMBOL(fddi_change_mtu);
 
 static const struct header_ops fddi_header_ops = {
 	.create		= fddi_header,
 	.rebuild	= fddi_rebuild_header,
 };
 
+
 static void fddi_setup(struct net_device *dev)
 {
-	dev->change_mtu		= fddi_change_mtu;
 	dev->header_ops		= &fddi_header_ops;
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
+	dev->change_mtu		= fddi_change_mtu,
+#endif
 
 	dev->type		= ARPHRD_FDDI;
 	dev->hard_header_len	= FDDI_K_SNAP_HLEN+3;	/* Assume 802.2 SNAP hdr len + 3 pad bytes */
-- 
cgit v1.2.3-70-g09d2


From 748ff68fad9600593c6abe47856037602bd5d133 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 20 Nov 2008 20:32:15 -0800
Subject: hippi: convert driver to net_device_ops

Convert the HIPPI infrastructure for use with net_device_ops.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/rrunner.c       | 15 +++++++++++----
 include/linux/hippidevice.h |  4 +++-
 net/802/hippi.c             | 14 +++++++++-----
 3 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/rrunner.c b/drivers/net/rrunner.c
index 6e4131f9a93..b4e3ddd0b59 100644
--- a/drivers/net/rrunner.c
+++ b/drivers/net/rrunner.c
@@ -63,6 +63,16 @@ MODULE_LICENSE("GPL");
 
 static char version[] __devinitdata = "rrunner.c: v0.50 11/11/2002  Jes Sorensen (jes@wildopensource.com)\n";
 
+
+static const struct net_device_ops rr_netdev_ops = {
+	.ndo_open 		= rr_open,
+	.ndo_stop		= rr_close,
+	.ndo_do_ioctl		= rr_ioctl,
+	.ndo_start_xmit		= rr_start_xmit,
+	.ndo_change_mtu		= hippi_change_mtu,
+	.ndo_set_mac_address	= hippi_mac_addr,
+};
+
 /*
  * Implementation notes:
  *
@@ -115,10 +125,7 @@ static int __devinit rr_init_one(struct pci_dev *pdev,
 	spin_lock_init(&rrpriv->lock);
 
 	dev->irq = pdev->irq;
-	dev->open = &rr_open;
-	dev->hard_start_xmit = &rr_start_xmit;
-	dev->stop = &rr_close;
-	dev->do_ioctl = &rr_ioctl;
+	dev->netdev_ops = &rr_netdev_ops;
 
 	dev->base_addr = pci_resource_start(pdev, 0);
 
diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h
index bab303dafd6..f148e490841 100644
--- a/include/linux/hippidevice.h
+++ b/include/linux/hippidevice.h
@@ -32,7 +32,9 @@ struct hippi_cb {
 };
 
 extern __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev);
-
+extern int hippi_change_mtu(struct net_device *dev, int new_mtu);
+extern int hippi_mac_addr(struct net_device *dev, void *p);
+extern int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p);
 extern struct net_device *alloc_hippi_dev(int sizeof_priv);
 #endif
 
diff --git a/net/802/hippi.c b/net/802/hippi.c
index e35dc1e0915..313b9ebf92e 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -144,7 +144,7 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 EXPORT_SYMBOL(hippi_type_trans);
 
-static int hippi_change_mtu(struct net_device *dev, int new_mtu)
+int hippi_change_mtu(struct net_device *dev, int new_mtu)
 {
 	/*
 	 * HIPPI's got these nice large MTUs.
@@ -154,12 +154,13 @@ static int hippi_change_mtu(struct net_device *dev, int new_mtu)
 	dev->mtu = new_mtu;
 	return(0);
 }
+EXPORT_SYMBOL(hippi_change_mtu);
 
 /*
  * For HIPPI we will actually use the lower 4 bytes of the hardware
  * address as the I-FIELD rather than the actual hardware address.
  */
-static int hippi_mac_addr(struct net_device *dev, void *p)
+int hippi_mac_addr(struct net_device *dev, void *p)
 {
 	struct sockaddr *addr = p;
 	if (netif_running(dev))
@@ -167,8 +168,9 @@ static int hippi_mac_addr(struct net_device *dev, void *p)
 	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
 	return 0;
 }
+EXPORT_SYMBOL(hippi_mac_addr);
 
-static int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
+int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
 {
 	/* Never send broadcast/multicast ARP messages */
 	p->mcast_probes = 0;
@@ -181,6 +183,7 @@ static int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
 		p->ucast_probes = 0;
 	return 0;
 }
+EXPORT_SYMBOL(hippi_neigh_setup_dev);
 
 static const struct header_ops hippi_header_ops = {
 	.create		= hippi_header,
@@ -190,11 +193,12 @@ static const struct header_ops hippi_header_ops = {
 
 static void hippi_setup(struct net_device *dev)
 {
-	dev->set_multicast_list		= NULL;
+#ifdef CONFIG_COMPAT_NET_DEV_OPS
 	dev->change_mtu			= hippi_change_mtu;
-	dev->header_ops			= &hippi_header_ops;
 	dev->set_mac_address 		= hippi_mac_addr;
 	dev->neigh_setup 		= hippi_neigh_setup_dev;
+#endif
+	dev->header_ops			= &hippi_header_ops;
 
 	/*
 	 * We don't support HIPPI `ARP' for the time being, and probably
-- 
cgit v1.2.3-70-g09d2


From 2f90b8657ec942d1880f720e0177ee71df7c8e3c Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 20 Nov 2008 20:52:10 -0800
Subject: ixgbe: this patch adds support for DCB to the kernel and ixgbe driver

This adds support for Data Center Bridging (DCB) features in the ixgbe
driver and adds an rtnetlink interface for configuring DCB to the
kernel.  The DCB feature support included are Priority Grouping (PG) -
which allows bandwidth guarantees to be allocated to groups to traffic
based on the 802.1q priority, and Priority Based Flow Control (PFC) -
which introduces a new MAC control PAUSE frame which works at
granularity of the 802.1p priority instead of the link (IEEE 802.3x).

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig                 |  10 +
 drivers/net/ixgbe/Makefile          |   2 +
 drivers/net/ixgbe/ixgbe.h           |  25 +-
 drivers/net/ixgbe/ixgbe_dcb.c       | 332 +++++++++++++++++
 drivers/net/ixgbe/ixgbe_dcb.h       | 157 ++++++++
 drivers/net/ixgbe/ixgbe_dcb_82598.c | 398 ++++++++++++++++++++
 drivers/net/ixgbe/ixgbe_dcb_82598.h |  94 +++++
 drivers/net/ixgbe/ixgbe_dcb_nl.c    | 356 ++++++++++++++++++
 drivers/net/ixgbe/ixgbe_ethtool.c   |  30 +-
 drivers/net/ixgbe/ixgbe_main.c      | 200 +++++++++-
 include/linux/dcbnl.h               | 230 ++++++++++++
 include/linux/netdevice.h           |   8 +
 include/linux/rtnetlink.h           |   5 +
 include/net/dcbnl.h                 |  44 +++
 net/Kconfig                         |   1 +
 net/Makefile                        |   3 +
 net/dcb/Kconfig                     |  12 +
 net/dcb/Makefile                    |   1 +
 net/dcb/dcbnl.c                     | 704 ++++++++++++++++++++++++++++++++++++
 19 files changed, 2593 insertions(+), 19 deletions(-)
 create mode 100644 drivers/net/ixgbe/ixgbe_dcb.c
 create mode 100644 drivers/net/ixgbe/ixgbe_dcb.h
 create mode 100644 drivers/net/ixgbe/ixgbe_dcb_82598.c
 create mode 100644 drivers/net/ixgbe/ixgbe_dcb_82598.h
 create mode 100644 drivers/net/ixgbe/ixgbe_dcb_nl.c
 create mode 100644 include/linux/dcbnl.h
 create mode 100644 include/net/dcbnl.h
 create mode 100644 net/dcb/Kconfig
 create mode 100644 net/dcb/Makefile
 create mode 100644 net/dcb/dcbnl.c

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index afa206590ad..efd461d7c2b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2451,6 +2451,16 @@ config IXGBE_DCA
 	  driver.  DCA is a method for warming the CPU cache before data
 	  is used, with the intent of lessening the impact of cache misses.
 
+config IXGBE_DCBNL
+	bool "Data Center Bridging (DCB) Support"
+	default n
+	depends on IXGBE && DCBNL
+	---help---
+	  Say Y here if you want to use Data Center Bridging (DCB) in the
+	  driver.
+
+	  If unsure, say N.
+
 config IXGB
 	tristate "Intel(R) PRO/10GbE support"
 	depends on PCI
diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
index ccd83d9f579..3228e508e62 100644
--- a/drivers/net/ixgbe/Makefile
+++ b/drivers/net/ixgbe/Makefile
@@ -34,3 +34,5 @@ obj-$(CONFIG_IXGBE) += ixgbe.o
 
 ixgbe-objs := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \
               ixgbe_82598.o ixgbe_phy.o
+
+ixgbe-$(CONFIG_IXGBE_DCBNL) +=  ixgbe_dcb.o ixgbe_dcb_82598.o ixgbe_dcb_nl.o
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 132854f646b..796f189f387 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -35,7 +35,7 @@
 
 #include "ixgbe_type.h"
 #include "ixgbe_common.h"
-
+#include "ixgbe_dcb.h"
 #ifdef CONFIG_IXGBE_DCA
 #include <linux/dca.h>
 #endif
@@ -84,6 +84,7 @@
 #define IXGBE_TX_FLAGS_TSO		(u32)(1 << 2)
 #define IXGBE_TX_FLAGS_IPV4		(u32)(1 << 3)
 #define IXGBE_TX_FLAGS_VLAN_MASK	0xffff0000
+#define IXGBE_TX_FLAGS_VLAN_PRIO_MASK   0x0000e000
 #define IXGBE_TX_FLAGS_VLAN_SHIFT	16
 
 #define IXGBE_MAX_LRO_DESCRIPTORS       8
@@ -134,7 +135,7 @@ struct ixgbe_ring {
 
 	u16 reg_idx; /* holds the special value that gets the hardware register
 		      * offset associated with this ring, which is different
-		      * for DCE and RSS modes */
+		      * for DCB and RSS modes */
 
 #ifdef CONFIG_IXGBE_DCA
 	/* cpu for tx queue */
@@ -152,8 +153,10 @@ struct ixgbe_ring {
 	u16 rx_buf_len;
 };
 
+#define RING_F_DCB  0
 #define RING_F_VMDQ 1
 #define RING_F_RSS  2
+#define IXGBE_MAX_DCB_INDICES   8
 #define IXGBE_MAX_RSS_INDICES  16
 #define IXGBE_MAX_VMDQ_INDICES 16
 struct ixgbe_ring_feature {
@@ -164,6 +167,10 @@ struct ixgbe_ring_feature {
 #define MAX_RX_QUEUES 64
 #define MAX_TX_QUEUES 32
 
+#define MAX_RX_PACKET_BUFFERS ((adapter->flags & IXGBE_FLAG_DCB_ENABLED) \
+                              ? 8 : 1)
+#define MAX_TX_PACKET_BUFFERS MAX_RX_PACKET_BUFFERS
+
 /* MAX_MSIX_Q_VECTORS of these are allocated,
  * but we only use one per queue-specific vector.
  */
@@ -215,6 +222,9 @@ struct ixgbe_adapter {
 	struct work_struct reset_task;
 	struct ixgbe_q_vector q_vector[MAX_MSIX_Q_VECTORS];
 	char name[MAX_MSIX_COUNT][IFNAMSIZ + 5];
+	struct ixgbe_dcb_config dcb_cfg;
+	struct ixgbe_dcb_config temp_dcb_cfg;
+	u8 dcb_set_bitmap;
 
 	/* Interrupt Throttle Rate */
 	u32 itr_setting;
@@ -270,6 +280,7 @@ struct ixgbe_adapter {
 #define IXGBE_FLAG_FAN_FAIL_CAPABLE             (u32)(1 << 20)
 #define IXGBE_FLAG_NEED_LINK_UPDATE             (u32)(1 << 22)
 #define IXGBE_FLAG_IN_WATCHDOG_TASK             (u32)(1 << 23)
+#define IXGBE_FLAG_DCB_ENABLED                  (u32)(1 << 24)
 
 /* default to trying for four seconds */
 #define IXGBE_TRY_LINK_TIMEOUT (4 * HZ)
@@ -313,6 +324,12 @@ enum ixgbe_boards {
 };
 
 extern struct ixgbe_info ixgbe_82598_info;
+#ifdef CONFIG_IXGBE_DCBNL
+extern struct dcbnl_rtnl_ops dcbnl_ops;
+extern int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
+                              struct ixgbe_dcb_config *dst_dcb_cfg,
+                              int tc_max);
+#endif
 
 extern char ixgbe_driver_name[];
 extern const char ixgbe_driver_version[];
@@ -327,5 +344,9 @@ extern int ixgbe_setup_tx_resources(struct ixgbe_adapter *, struct ixgbe_ring *)
 extern void ixgbe_free_rx_resources(struct ixgbe_adapter *, struct ixgbe_ring *);
 extern void ixgbe_free_tx_resources(struct ixgbe_adapter *, struct ixgbe_ring *);
 extern void ixgbe_update_stats(struct ixgbe_adapter *adapter);
+extern void ixgbe_reset_interrupt_capability(struct ixgbe_adapter *adapter);
+extern int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter);
+void ixgbe_napi_add_all(struct ixgbe_adapter *adapter);
+void ixgbe_napi_del_all(struct ixgbe_adapter *adapter);
 
 #endif /* _IXGBE_H_ */
diff --git a/drivers/net/ixgbe/ixgbe_dcb.c b/drivers/net/ixgbe/ixgbe_dcb.c
new file mode 100644
index 00000000000..e2e28ac63de
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_dcb.c
@@ -0,0 +1,332 @@
+/*******************************************************************************
+
+  Intel 10 Gigabit PCI Express Linux driver
+  Copyright(c) 1999 - 2007 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  Linux NICS <linux.nics@intel.com>
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+
+#include "ixgbe.h"
+#include "ixgbe_type.h"
+#include "ixgbe_dcb.h"
+#include "ixgbe_dcb_82598.h"
+
+/**
+ * ixgbe_dcb_config - Struct containing DCB settings.
+ * @dcb_config: Pointer to DCB config structure
+ *
+ * This function checks DCB rules for DCB settings.
+ * The following rules are checked:
+ * 1. The sum of bandwidth percentages of all Bandwidth Groups must total 100%.
+ * 2. The sum of bandwidth percentages of all Traffic Classes within a Bandwidth
+ *    Group must total 100.
+ * 3. A Traffic Class should not be set to both Link Strict Priority
+ *    and Group Strict Priority.
+ * 4. Link strict Bandwidth Groups can only have link strict traffic classes
+ *    with zero bandwidth.
+ */
+s32 ixgbe_dcb_check_config(struct ixgbe_dcb_config *dcb_config)
+{
+	struct tc_bw_alloc *p;
+	s32 ret_val = 0;
+	u8 i, j, bw = 0, bw_id;
+	u8 bw_sum[2][MAX_BW_GROUP];
+	bool link_strict[2][MAX_BW_GROUP];
+
+	memset(bw_sum, 0, sizeof(bw_sum));
+	memset(link_strict, 0, sizeof(link_strict));
+
+	/* First Tx, then Rx */
+	for (i = 0; i < 2; i++) {
+		/* Check each traffic class for rule violation */
+		for (j = 0; j < MAX_TRAFFIC_CLASS; j++) {
+			p = &dcb_config->tc_config[j].path[i];
+
+			bw = p->bwg_percent;
+			bw_id = p->bwg_id;
+
+			if (bw_id >= MAX_BW_GROUP) {
+				ret_val = DCB_ERR_CONFIG;
+				goto err_config;
+			}
+			if (p->prio_type == prio_link) {
+				link_strict[i][bw_id] = true;
+				/* Link strict should have zero bandwidth */
+				if (bw) {
+					ret_val = DCB_ERR_LS_BW_NONZERO;
+					goto err_config;
+				}
+			} else if (!bw) {
+				/*
+				 * Traffic classes without link strict
+				 * should have non-zero bandwidth.
+				 */
+				ret_val = DCB_ERR_TC_BW_ZERO;
+				goto err_config;
+			}
+			bw_sum[i][bw_id] += bw;
+		}
+
+		bw = 0;
+
+		/* Check each bandwidth group for rule violation */
+		for (j = 0; j < MAX_BW_GROUP; j++) {
+			bw += dcb_config->bw_percentage[i][j];
+			/*
+			 * Sum of bandwidth percentages of all traffic classes
+			 * within a Bandwidth Group must total 100 except for
+			 * link strict group (zero bandwidth).
+			 */
+			if (link_strict[i][j]) {
+				if (bw_sum[i][j]) {
+					/*
+					 * Link strict group should have zero
+					 * bandwidth.
+					 */
+					ret_val = DCB_ERR_LS_BWG_NONZERO;
+					goto err_config;
+				}
+			} else if (bw_sum[i][j] != BW_PERCENT &&
+				   bw_sum[i][j] != 0) {
+				ret_val = DCB_ERR_TC_BW;
+				goto err_config;
+			}
+		}
+
+		if (bw != BW_PERCENT) {
+			ret_val = DCB_ERR_BW_GROUP;
+			goto err_config;
+		}
+	}
+
+err_config:
+	return ret_val;
+}
+
+/**
+ * ixgbe_dcb_calculate_tc_credits - Calculates traffic class credits
+ * @ixgbe_dcb_config: Struct containing DCB settings.
+ * @direction: Configuring either Tx or Rx.
+ *
+ * This function calculates the credits allocated to each traffic class.
+ * It should be called only after the rules are checked by
+ * ixgbe_dcb_check_config().
+ */
+s32 ixgbe_dcb_calculate_tc_credits(struct ixgbe_dcb_config *dcb_config,
+                                   u8 direction)
+{
+	struct tc_bw_alloc *p;
+	s32 ret_val = 0;
+	/* Initialization values default for Tx settings */
+	u32 credit_refill       = 0;
+	u32 credit_max          = 0;
+	u16 link_percentage     = 0;
+	u8  bw_percent          = 0;
+	u8  i;
+
+	if (dcb_config == NULL) {
+		ret_val = DCB_ERR_CONFIG;
+		goto out;
+	}
+
+	/* Find out the link percentage for each TC first */
+	for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
+		p = &dcb_config->tc_config[i].path[direction];
+		bw_percent = dcb_config->bw_percentage[direction][p->bwg_id];
+
+		link_percentage = p->bwg_percent;
+		/* Must be careful of integer division for very small nums */
+		link_percentage = (link_percentage * bw_percent) / 100;
+		if (p->bwg_percent > 0 && link_percentage == 0)
+			link_percentage = 1;
+
+		/* Save link_percentage for reference */
+		p->link_percent = (u8)link_percentage;
+
+		/* Calculate credit refill and save it */
+		credit_refill = link_percentage * MINIMUM_CREDIT_REFILL;
+		p->data_credits_refill = (u16)credit_refill;
+
+		/* Calculate maximum credit for the TC */
+		credit_max = (link_percentage * MAX_CREDIT) / 100;
+
+		/*
+		 * Adjustment based on rule checking, if the percentage
+		 * of a TC is too small, the maximum credit may not be
+		 * enough to send out a jumbo frame in data plane arbitration.
+		 */
+		if (credit_max && (credit_max < MINIMUM_CREDIT_FOR_JUMBO))
+			credit_max = MINIMUM_CREDIT_FOR_JUMBO;
+
+		if (direction == DCB_TX_CONFIG) {
+			/*
+			 * Adjustment based on rule checking, if the
+			 * percentage of a TC is too small, the maximum
+			 * credit may not be enough to send out a TSO
+			 * packet in descriptor plane arbitration.
+			 */
+			if (credit_max &&
+			    (credit_max < MINIMUM_CREDIT_FOR_TSO))
+				credit_max = MINIMUM_CREDIT_FOR_TSO;
+
+			dcb_config->tc_config[i].desc_credits_max =
+				(u16)credit_max;
+		}
+
+		p->data_credits_max = (u16)credit_max;
+	}
+
+out:
+	return ret_val;
+}
+
+/**
+ * ixgbe_dcb_get_tc_stats - Returns status of each traffic class
+ * @hw: pointer to hardware structure
+ * @stats: pointer to statistics structure
+ * @tc_count:  Number of elements in bwg_array.
+ *
+ * This function returns the status data for each of the Traffic Classes in use.
+ */
+s32 ixgbe_dcb_get_tc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats,
+                           u8 tc_count)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_get_tc_stats_82598(hw, stats, tc_count);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_get_pfc_stats - Returns CBFC status of each traffic class
+ * hw - pointer to hardware structure
+ * stats - pointer to statistics structure
+ * tc_count -  Number of elements in bwg_array.
+ *
+ * This function returns the CBFC status data for each of the Traffic Classes.
+ */
+s32 ixgbe_dcb_get_pfc_stats(struct ixgbe_hw *hw, struct ixgbe_hw_stats *stats,
+                            u8 tc_count)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_get_pfc_stats_82598(hw, stats, tc_count);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_config_rx_arbiter - Config Rx arbiter
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Rx Data Arbiter and credits for each traffic class.
+ */
+s32 ixgbe_dcb_config_rx_arbiter(struct ixgbe_hw *hw,
+                                struct ixgbe_dcb_config *dcb_config)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_config_rx_arbiter_82598(hw, dcb_config);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_config_tx_desc_arbiter - Config Tx Desc arbiter
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Tx Descriptor Arbiter and credits for each traffic class.
+ */
+s32 ixgbe_dcb_config_tx_desc_arbiter(struct ixgbe_hw *hw,
+                                     struct ixgbe_dcb_config *dcb_config)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_config_tx_desc_arbiter_82598(hw, dcb_config);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_config_tx_data_arbiter - Config Tx data arbiter
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Tx Data Arbiter and credits for each traffic class.
+ */
+s32 ixgbe_dcb_config_tx_data_arbiter(struct ixgbe_hw *hw,
+                                     struct ixgbe_dcb_config *dcb_config)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_config_tx_data_arbiter_82598(hw, dcb_config);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_config_pfc - Config priority flow control
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Priority Flow Control for each traffic class.
+ */
+s32 ixgbe_dcb_config_pfc(struct ixgbe_hw *hw,
+                         struct ixgbe_dcb_config *dcb_config)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_config_pfc_82598(hw, dcb_config);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_config_tc_stats - Config traffic class statistics
+ * @hw: pointer to hardware structure
+ *
+ * Configure queue statistics registers, all queues belonging to same traffic
+ * class uses a single set of queue statistics counters.
+ */
+s32 ixgbe_dcb_config_tc_stats(struct ixgbe_hw *hw)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_config_tc_stats_82598(hw);
+	return ret;
+}
+
+/**
+ * ixgbe_dcb_hw_config - Config and enable DCB
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure dcb settings and enable dcb mode.
+ */
+s32 ixgbe_dcb_hw_config(struct ixgbe_hw *hw,
+                        struct ixgbe_dcb_config *dcb_config)
+{
+	s32 ret = 0;
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		ret = ixgbe_dcb_hw_config_82598(hw, dcb_config);
+	return ret;
+}
+
diff --git a/drivers/net/ixgbe/ixgbe_dcb.h b/drivers/net/ixgbe/ixgbe_dcb.h
new file mode 100644
index 00000000000..62dfd243bed
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_dcb.h
@@ -0,0 +1,157 @@
+/*******************************************************************************
+
+  Intel 10 Gigabit PCI Express Linux driver
+  Copyright(c) 1999 - 2007 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  Linux NICS <linux.nics@intel.com>
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+#ifndef _DCB_CONFIG_H_
+#define _DCB_CONFIG_H_
+
+#include "ixgbe_type.h"
+
+/* DCB data structures */
+
+#define IXGBE_MAX_PACKET_BUFFERS 8
+#define MAX_USER_PRIORITY        8
+#define MAX_TRAFFIC_CLASS        8
+#define MAX_BW_GROUP             8
+#define BW_PERCENT               100
+
+#define DCB_TX_CONFIG            0
+#define DCB_RX_CONFIG            1
+
+/* DCB error Codes */
+#define DCB_SUCCESS              0
+#define DCB_ERR_CONFIG           -1
+#define DCB_ERR_PARAM            -2
+
+/* Transmit and receive Errors */
+/* Error in bandwidth group allocation */
+#define DCB_ERR_BW_GROUP        -3
+/* Error in traffic class bandwidth allocation */
+#define DCB_ERR_TC_BW           -4
+/* Traffic class has both link strict and group strict enabled */
+#define DCB_ERR_LS_GS           -5
+/* Link strict traffic class has non zero bandwidth */
+#define DCB_ERR_LS_BW_NONZERO   -6
+/* Link strict bandwidth group has non zero bandwidth */
+#define DCB_ERR_LS_BWG_NONZERO  -7
+/*  Traffic class has zero bandwidth */
+#define DCB_ERR_TC_BW_ZERO      -8
+
+#define DCB_NOT_IMPLEMENTED      0x7FFFFFFF
+
+struct dcb_pfc_tc_debug {
+	u8  tc;
+	u8  pause_status;
+	u64 pause_quanta;
+};
+
+enum strict_prio_type {
+	prio_none = 0,
+	prio_group,
+	prio_link
+};
+
+/* Traffic class bandwidth allocation per direction */
+struct tc_bw_alloc {
+	u8 bwg_id;		  /* Bandwidth Group (BWG) ID */
+	u8 bwg_percent;		  /* % of BWG's bandwidth */
+	u8 link_percent;	  /* % of link bandwidth */
+	u8 up_to_tc_bitmap;	  /* User Priority to Traffic Class mapping */
+	u16 data_credits_refill;  /* Credit refill amount in 64B granularity */
+	u16 data_credits_max;	  /* Max credits for a configured packet buffer
+				   * in 64B granularity.*/
+	enum strict_prio_type prio_type; /* Link or Group Strict Priority */
+};
+
+enum dcb_pfc_type {
+	pfc_disabled = 0,
+	pfc_enabled_full,
+	pfc_enabled_tx,
+	pfc_enabled_rx
+};
+
+/* Traffic class configuration */
+struct tc_configuration {
+	struct tc_bw_alloc path[2]; /* One each for Tx/Rx */
+	enum dcb_pfc_type  dcb_pfc; /* Class based flow control setting */
+
+	u16 desc_credits_max; /* For Tx Descriptor arbitration */
+	u8 tc; /* Traffic class (TC) */
+};
+
+enum dcb_rx_pba_cfg {
+	pba_equal,     /* PBA[0-7] each use 64KB FIFO */
+	pba_80_48      /* PBA[0-3] each use 80KB, PBA[4-7] each use 48KB */
+};
+
+struct ixgbe_dcb_config {
+	struct tc_configuration tc_config[MAX_TRAFFIC_CLASS];
+	u8     bw_percentage[2][MAX_BW_GROUP]; /* One each for Tx/Rx */
+
+	bool  round_robin_enable;
+
+	enum dcb_rx_pba_cfg rx_pba_cfg;
+
+	u32  dcb_cfg_version; /* Not used...OS-specific? */
+	u32  link_speed; /* For bandwidth allocation validation purpose */
+};
+
+/* DCB driver APIs */
+
+/* DCB rule checking function.*/
+s32 ixgbe_dcb_check_config(struct ixgbe_dcb_config *config);
+
+/* DCB credits calculation */
+s32 ixgbe_dcb_calculate_tc_credits(struct ixgbe_dcb_config *, u8);
+
+/* DCB PFC functions */
+s32 ixgbe_dcb_config_pfc(struct ixgbe_hw *, struct ixgbe_dcb_config *g);
+s32 ixgbe_dcb_get_pfc_stats(struct ixgbe_hw *, struct ixgbe_hw_stats *, u8);
+
+/* DCB traffic class stats */
+s32 ixgbe_dcb_config_tc_stats(struct ixgbe_hw *);
+s32 ixgbe_dcb_get_tc_stats(struct ixgbe_hw *, struct ixgbe_hw_stats *, u8);
+
+/* DCB config arbiters */
+s32 ixgbe_dcb_config_tx_desc_arbiter(struct ixgbe_hw *,
+                                     struct ixgbe_dcb_config *);
+s32 ixgbe_dcb_config_tx_data_arbiter(struct ixgbe_hw *,
+                                     struct ixgbe_dcb_config *);
+s32 ixgbe_dcb_config_rx_arbiter(struct ixgbe_hw *, struct ixgbe_dcb_config *);
+
+/* DCB hw initialization */
+s32 ixgbe_dcb_hw_config(struct ixgbe_hw *, struct ixgbe_dcb_config *);
+
+/* DCB definitions for credit calculation */
+#define MAX_CREDIT_REFILL       511  /* 0x1FF * 64B = 32704B */
+#define MINIMUM_CREDIT_REFILL   5    /* 5*64B = 320B */
+#define MINIMUM_CREDIT_FOR_JUMBO 145  /* 145= UpperBound((9*1024+54)/64B) for 9KB jumbo frame */
+#define DCB_MAX_TSO_SIZE        (32*1024) /* MAX TSO packet size supported in DCB mode */
+#define MINIMUM_CREDIT_FOR_TSO  (DCB_MAX_TSO_SIZE/64 + 1) /* 513 for 32KB TSO packet */
+#define MAX_CREDIT              4095 /* Maximum credit supported: 256KB * 1204 / 64B */
+
+#endif /* _DCB_CONFIG_H */
diff --git a/drivers/net/ixgbe/ixgbe_dcb_82598.c b/drivers/net/ixgbe/ixgbe_dcb_82598.c
new file mode 100644
index 00000000000..fce6867a451
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_dcb_82598.c
@@ -0,0 +1,398 @@
+/*******************************************************************************
+
+  Intel 10 Gigabit PCI Express Linux driver
+  Copyright(c) 1999 - 2007 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  Linux NICS <linux.nics@intel.com>
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+#include "ixgbe.h"
+#include "ixgbe_type.h"
+#include "ixgbe_dcb.h"
+#include "ixgbe_dcb_82598.h"
+
+/**
+ * ixgbe_dcb_get_tc_stats_82598 - Return status data for each traffic class
+ * @hw: pointer to hardware structure
+ * @stats: pointer to statistics structure
+ * @tc_count:  Number of elements in bwg_array.
+ *
+ * This function returns the status data for each of the Traffic Classes in use.
+ */
+s32 ixgbe_dcb_get_tc_stats_82598(struct ixgbe_hw *hw,
+                                 struct ixgbe_hw_stats *stats,
+                                 u8 tc_count)
+{
+	int tc;
+
+	if (tc_count > MAX_TRAFFIC_CLASS)
+		return DCB_ERR_PARAM;
+
+	/* Statistics pertaining to each traffic class */
+	for (tc = 0; tc < tc_count; tc++) {
+		/* Transmitted Packets */
+		stats->qptc[tc] += IXGBE_READ_REG(hw, IXGBE_QPTC(tc));
+		/* Transmitted Bytes */
+		stats->qbtc[tc] += IXGBE_READ_REG(hw, IXGBE_QBTC(tc));
+		/* Received Packets */
+		stats->qprc[tc] += IXGBE_READ_REG(hw, IXGBE_QPRC(tc));
+		/* Received Bytes */
+		stats->qbrc[tc] += IXGBE_READ_REG(hw, IXGBE_QBRC(tc));
+	}
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_get_pfc_stats_82598 - Returns CBFC status data
+ * @hw: pointer to hardware structure
+ * @stats: pointer to statistics structure
+ * @tc_count:  Number of elements in bwg_array.
+ *
+ * This function returns the CBFC status data for each of the Traffic Classes.
+ */
+s32 ixgbe_dcb_get_pfc_stats_82598(struct ixgbe_hw *hw,
+                                  struct ixgbe_hw_stats *stats,
+                                  u8 tc_count)
+{
+	int tc;
+
+	if (tc_count > MAX_TRAFFIC_CLASS)
+		return DCB_ERR_PARAM;
+
+	for (tc = 0; tc < tc_count; tc++) {
+		/* Priority XOFF Transmitted */
+		stats->pxofftxc[tc] += IXGBE_READ_REG(hw, IXGBE_PXOFFTXC(tc));
+		/* Priority XOFF Received */
+		stats->pxoffrxc[tc] += IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(tc));
+	}
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_config_packet_buffers_82598 - Configure packet buffers
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure packet buffers for DCB mode.
+ */
+s32 ixgbe_dcb_config_packet_buffers_82598(struct ixgbe_hw *hw,
+                                          struct ixgbe_dcb_config *dcb_config)
+{
+	s32 ret_val = 0;
+	u32 value = IXGBE_RXPBSIZE_64KB;
+	u8  i = 0;
+
+	/* Setup Rx packet buffer sizes */
+	switch (dcb_config->rx_pba_cfg) {
+	case pba_80_48:
+		/* Setup the first four at 80KB */
+		value = IXGBE_RXPBSIZE_80KB;
+		for (; i < 4; i++)
+			IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(i), value);
+		/* Setup the last four at 48KB...don't re-init i */
+		value = IXGBE_RXPBSIZE_48KB;
+		/* Fall Through */
+	case pba_equal:
+	default:
+		for (; i < IXGBE_MAX_PACKET_BUFFERS; i++)
+			IXGBE_WRITE_REG(hw, IXGBE_RXPBSIZE(i), value);
+
+		/* Setup Tx packet buffer sizes */
+		for (i = 0; i < IXGBE_MAX_PACKET_BUFFERS; i++) {
+			IXGBE_WRITE_REG(hw, IXGBE_TXPBSIZE(i),
+					IXGBE_TXPBSIZE_40KB);
+		}
+		break;
+	}
+
+	return ret_val;
+}
+
+/**
+ * ixgbe_dcb_config_rx_arbiter_82598 - Config Rx data arbiter
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Rx Data Arbiter and credits for each traffic class.
+ */
+s32 ixgbe_dcb_config_rx_arbiter_82598(struct ixgbe_hw *hw,
+                                      struct ixgbe_dcb_config *dcb_config)
+{
+	struct tc_bw_alloc    *p;
+	u32    reg           = 0;
+	u32    credit_refill = 0;
+	u32    credit_max    = 0;
+	u8     i             = 0;
+
+	reg = IXGBE_READ_REG(hw, IXGBE_RUPPBMR) | IXGBE_RUPPBMR_MQA;
+	IXGBE_WRITE_REG(hw, IXGBE_RUPPBMR, reg);
+
+	reg = IXGBE_READ_REG(hw, IXGBE_RMCS);
+	/* Enable Arbiter */
+	reg &= ~IXGBE_RMCS_ARBDIS;
+	/* Enable Receive Recycle within the BWG */
+	reg |= IXGBE_RMCS_RRM;
+	/* Enable Deficit Fixed Priority arbitration*/
+	reg |= IXGBE_RMCS_DFP;
+
+	IXGBE_WRITE_REG(hw, IXGBE_RMCS, reg);
+
+	/* Configure traffic class credits and priority */
+	for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
+		p = &dcb_config->tc_config[i].path[DCB_RX_CONFIG];
+		credit_refill = p->data_credits_refill;
+		credit_max    = p->data_credits_max;
+
+		reg = credit_refill | (credit_max << IXGBE_RT2CR_MCL_SHIFT);
+
+		if (p->prio_type == prio_link)
+			reg |= IXGBE_RT2CR_LSP;
+
+		IXGBE_WRITE_REG(hw, IXGBE_RT2CR(i), reg);
+	}
+
+	reg = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
+	reg |= IXGBE_RDRXCTL_RDMTS_1_2;
+	reg |= IXGBE_RDRXCTL_MPBEN;
+	reg |= IXGBE_RDRXCTL_MCEN;
+	IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg);
+
+	reg = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
+	/* Make sure there is enough descriptors before arbitration */
+	reg &= ~IXGBE_RXCTRL_DMBYPS;
+	IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, reg);
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_config_tx_desc_arbiter_82598 - Config Tx Desc. arbiter
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Tx Descriptor Arbiter and credits for each traffic class.
+ */
+s32 ixgbe_dcb_config_tx_desc_arbiter_82598(struct ixgbe_hw *hw,
+                                           struct ixgbe_dcb_config *dcb_config)
+{
+	struct tc_bw_alloc *p;
+	u32    reg, max_credits;
+	u8     i;
+
+	reg = IXGBE_READ_REG(hw, IXGBE_DPMCS);
+
+	/* Enable arbiter */
+	reg &= ~IXGBE_DPMCS_ARBDIS;
+	if (!(dcb_config->round_robin_enable)) {
+		/* Enable DFP and Recycle mode */
+		reg |= (IXGBE_DPMCS_TDPAC | IXGBE_DPMCS_TRM);
+	}
+	reg |= IXGBE_DPMCS_TSOEF;
+	/* Configure Max TSO packet size 34KB including payload and headers */
+	reg |= (0x4 << IXGBE_DPMCS_MTSOS_SHIFT);
+
+	IXGBE_WRITE_REG(hw, IXGBE_DPMCS, reg);
+
+	/* Configure traffic class credits and priority */
+	for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
+		p = &dcb_config->tc_config[i].path[DCB_TX_CONFIG];
+		max_credits = dcb_config->tc_config[i].desc_credits_max;
+		reg = max_credits << IXGBE_TDTQ2TCCR_MCL_SHIFT;
+		reg |= p->data_credits_refill;
+		reg |= (u32)(p->bwg_id) << IXGBE_TDTQ2TCCR_BWG_SHIFT;
+
+		if (p->prio_type == prio_group)
+			reg |= IXGBE_TDTQ2TCCR_GSP;
+
+		if (p->prio_type == prio_link)
+			reg |= IXGBE_TDTQ2TCCR_LSP;
+
+		IXGBE_WRITE_REG(hw, IXGBE_TDTQ2TCCR(i), reg);
+	}
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_config_tx_data_arbiter_82598 - Config Tx data arbiter
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Tx Data Arbiter and credits for each traffic class.
+ */
+s32 ixgbe_dcb_config_tx_data_arbiter_82598(struct ixgbe_hw *hw,
+                                           struct ixgbe_dcb_config *dcb_config)
+{
+	struct tc_bw_alloc *p;
+	u32 reg;
+	u8 i;
+
+	reg = IXGBE_READ_REG(hw, IXGBE_PDPMCS);
+	/* Enable Data Plane Arbiter */
+	reg &= ~IXGBE_PDPMCS_ARBDIS;
+	/* Enable DFP and Transmit Recycle Mode */
+	reg |= (IXGBE_PDPMCS_TPPAC | IXGBE_PDPMCS_TRM);
+
+	IXGBE_WRITE_REG(hw, IXGBE_PDPMCS, reg);
+
+	/* Configure traffic class credits and priority */
+	for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
+		p = &dcb_config->tc_config[i].path[DCB_TX_CONFIG];
+		reg = p->data_credits_refill;
+		reg |= (u32)(p->data_credits_max) << IXGBE_TDPT2TCCR_MCL_SHIFT;
+		reg |= (u32)(p->bwg_id) << IXGBE_TDPT2TCCR_BWG_SHIFT;
+
+		if (p->prio_type == prio_group)
+			reg |= IXGBE_TDPT2TCCR_GSP;
+
+		if (p->prio_type == prio_link)
+			reg |= IXGBE_TDPT2TCCR_LSP;
+
+		IXGBE_WRITE_REG(hw, IXGBE_TDPT2TCCR(i), reg);
+	}
+
+	/* Enable Tx packet buffer division */
+	reg = IXGBE_READ_REG(hw, IXGBE_DTXCTL);
+	reg |= IXGBE_DTXCTL_ENDBUBD;
+	IXGBE_WRITE_REG(hw, IXGBE_DTXCTL, reg);
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_config_pfc_82598 - Config priority flow control
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure Priority Flow Control for each traffic class.
+ */
+s32 ixgbe_dcb_config_pfc_82598(struct ixgbe_hw *hw,
+                               struct ixgbe_dcb_config *dcb_config)
+{
+	u32 reg, rx_pba_size;
+	u8  i;
+
+	/* Enable Transmit Priority Flow Control */
+	reg = IXGBE_READ_REG(hw, IXGBE_RMCS);
+	reg &= ~IXGBE_RMCS_TFCE_802_3X;
+	/* correct the reporting of our flow control status */
+	hw->fc.type = ixgbe_fc_none;
+	reg |= IXGBE_RMCS_TFCE_PRIORITY;
+	IXGBE_WRITE_REG(hw, IXGBE_RMCS, reg);
+
+	/* Enable Receive Priority Flow Control */
+	reg = IXGBE_READ_REG(hw, IXGBE_FCTRL);
+	reg &= ~IXGBE_FCTRL_RFCE;
+	reg |= IXGBE_FCTRL_RPFCE;
+	IXGBE_WRITE_REG(hw, IXGBE_FCTRL, reg);
+
+	/*
+	 * Configure flow control thresholds and enable priority flow control
+	 * for each traffic class.
+	 */
+	for (i = 0; i < MAX_TRAFFIC_CLASS; i++) {
+		if (dcb_config->rx_pba_cfg == pba_equal) {
+			rx_pba_size = IXGBE_RXPBSIZE_64KB;
+		} else {
+			rx_pba_size = (i < 4) ? IXGBE_RXPBSIZE_80KB
+					      : IXGBE_RXPBSIZE_48KB;
+		}
+
+		reg = ((rx_pba_size >> 5) &  0xFFF0);
+		if (dcb_config->tc_config[i].dcb_pfc == pfc_enabled_tx ||
+		    dcb_config->tc_config[i].dcb_pfc == pfc_enabled_full)
+			reg |= IXGBE_FCRTL_XONE;
+
+		IXGBE_WRITE_REG(hw, IXGBE_FCRTL(i), reg);
+
+		reg = ((rx_pba_size >> 2) & 0xFFF0);
+		if (dcb_config->tc_config[i].dcb_pfc == pfc_enabled_tx ||
+		    dcb_config->tc_config[i].dcb_pfc == pfc_enabled_full)
+			reg |= IXGBE_FCRTH_FCEN;
+
+		IXGBE_WRITE_REG(hw, IXGBE_FCRTH(i), reg);
+	}
+
+	/* Configure pause time */
+	for (i = 0; i < (MAX_TRAFFIC_CLASS >> 1); i++)
+		IXGBE_WRITE_REG(hw, IXGBE_FCTTV(i), 0x68006800);
+
+	/* Configure flow control refresh threshold value */
+	IXGBE_WRITE_REG(hw, IXGBE_FCRTV, 0x3400);
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_config_tc_stats_82598 - Configure traffic class statistics
+ * @hw: pointer to hardware structure
+ *
+ * Configure queue statistics registers, all queues belonging to same traffic
+ * class uses a single set of queue statistics counters.
+ */
+s32 ixgbe_dcb_config_tc_stats_82598(struct ixgbe_hw *hw)
+{
+	u32 reg = 0;
+	u8  i   = 0;
+	u8  j   = 0;
+
+	/* Receive Queues stats setting -  8 queues per statistics reg */
+	for (i = 0, j = 0; i < 15 && j < 8; i = i + 2, j++) {
+		reg = IXGBE_READ_REG(hw, IXGBE_RQSMR(i));
+		reg |= ((0x1010101) * j);
+		IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i), reg);
+		reg = IXGBE_READ_REG(hw, IXGBE_RQSMR(i + 1));
+		reg |= ((0x1010101) * j);
+		IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i + 1), reg);
+	}
+	/* Transmit Queues stats setting -  4 queues per statistics reg */
+	for (i = 0; i < 8; i++) {
+		reg = IXGBE_READ_REG(hw, IXGBE_TQSMR(i));
+		reg |= ((0x1010101) * i);
+		IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i), reg);
+	}
+
+	return 0;
+}
+
+/**
+ * ixgbe_dcb_hw_config_82598 - Config and enable DCB
+ * @hw: pointer to hardware structure
+ * @dcb_config: pointer to ixgbe_dcb_config structure
+ *
+ * Configure dcb settings and enable dcb mode.
+ */
+s32 ixgbe_dcb_hw_config_82598(struct ixgbe_hw *hw,
+                              struct ixgbe_dcb_config *dcb_config)
+{
+	ixgbe_dcb_config_packet_buffers_82598(hw, dcb_config);
+	ixgbe_dcb_config_rx_arbiter_82598(hw, dcb_config);
+	ixgbe_dcb_config_tx_desc_arbiter_82598(hw, dcb_config);
+	ixgbe_dcb_config_tx_data_arbiter_82598(hw, dcb_config);
+	ixgbe_dcb_config_pfc_82598(hw, dcb_config);
+	ixgbe_dcb_config_tc_stats_82598(hw);
+
+	return 0;
+}
diff --git a/drivers/net/ixgbe/ixgbe_dcb_82598.h b/drivers/net/ixgbe/ixgbe_dcb_82598.h
new file mode 100644
index 00000000000..1e6a313719d
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_dcb_82598.h
@@ -0,0 +1,94 @@
+/*******************************************************************************
+
+  Intel 10 Gigabit PCI Express Linux driver
+  Copyright(c) 1999 - 2007 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  Linux NICS <linux.nics@intel.com>
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+#ifndef _DCB_82598_CONFIG_H_
+#define _DCB_82598_CONFIG_H_
+
+/* DCB register definitions */
+
+#define IXGBE_DPMCS_MTSOS_SHIFT 16
+#define IXGBE_DPMCS_TDPAC       0x00000001 /* 0 Round Robin, 1 DFP - Deficit Fixed Priority */
+#define IXGBE_DPMCS_TRM         0x00000010 /* Transmit Recycle Mode */
+#define IXGBE_DPMCS_ARBDIS      0x00000040 /* DCB arbiter disable */
+#define IXGBE_DPMCS_TSOEF       0x00080000 /* TSO Expand Factor: 0=x4, 1=x2 */
+
+#define IXGBE_RUPPBMR_MQA       0x80000000 /* Enable UP to queue mapping */
+
+#define IXGBE_RT2CR_MCL_SHIFT   12 /* Offset to Max Credit Limit setting */
+#define IXGBE_RT2CR_LSP         0x80000000 /* LSP enable bit */
+
+#define IXGBE_RDRXCTL_MPBEN     0x00000010 /* DMA config for multiple packet buffers enable */
+#define IXGBE_RDRXCTL_MCEN      0x00000040 /* DMA config for multiple cores (RSS) enable */
+
+#define IXGBE_TDTQ2TCCR_MCL_SHIFT   12
+#define IXGBE_TDTQ2TCCR_BWG_SHIFT   9
+#define IXGBE_TDTQ2TCCR_GSP     0x40000000
+#define IXGBE_TDTQ2TCCR_LSP     0x80000000
+
+#define IXGBE_TDPT2TCCR_MCL_SHIFT   12
+#define IXGBE_TDPT2TCCR_BWG_SHIFT   9
+#define IXGBE_TDPT2TCCR_GSP     0x40000000
+#define IXGBE_TDPT2TCCR_LSP     0x80000000
+
+#define IXGBE_PDPMCS_TPPAC      0x00000020 /* 0 Round Robin, 1 for DFP - Deficit Fixed Priority */
+#define IXGBE_PDPMCS_ARBDIS     0x00000040 /* Arbiter disable */
+#define IXGBE_PDPMCS_TRM        0x00000100 /* Transmit Recycle Mode enable */
+
+#define IXGBE_DTXCTL_ENDBUBD    0x00000004 /* Enable DBU buffer division */
+
+#define IXGBE_TXPBSIZE_40KB     0x0000A000 /* 40KB Packet Buffer */
+#define IXGBE_RXPBSIZE_48KB     0x0000C000 /* 48KB Packet Buffer */
+#define IXGBE_RXPBSIZE_64KB     0x00010000 /* 64KB Packet Buffer */
+#define IXGBE_RXPBSIZE_80KB     0x00014000 /* 80KB Packet Buffer */
+
+#define IXGBE_RDRXCTL_RDMTS_1_2 0x00000000
+
+/* DCB hardware-specific driver APIs */
+
+/* DCB PFC functions */
+s32 ixgbe_dcb_config_pfc_82598(struct ixgbe_hw *, struct ixgbe_dcb_config *);
+s32 ixgbe_dcb_get_pfc_stats_82598(struct ixgbe_hw *, struct ixgbe_hw_stats *,
+                                  u8);
+
+/* DCB traffic class stats */
+s32 ixgbe_dcb_config_tc_stats_82598(struct ixgbe_hw *);
+s32 ixgbe_dcb_get_tc_stats_82598(struct ixgbe_hw *, struct ixgbe_hw_stats *,
+                                 u8);
+
+/* DCB config arbiters */
+s32 ixgbe_dcb_config_tx_desc_arbiter_82598(struct ixgbe_hw *,
+                                           struct ixgbe_dcb_config *);
+s32 ixgbe_dcb_config_tx_data_arbiter_82598(struct ixgbe_hw *,
+                                           struct ixgbe_dcb_config *);
+s32 ixgbe_dcb_config_rx_arbiter_82598(struct ixgbe_hw *,
+                                      struct ixgbe_dcb_config *);
+
+/* DCB hw initialization */
+s32 ixgbe_dcb_hw_config_82598(struct ixgbe_hw *, struct ixgbe_dcb_config *);
+
+#endif /* _DCB_82598_CONFIG_H */
diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
new file mode 100644
index 00000000000..50bff2af6b0
--- /dev/null
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -0,0 +1,356 @@
+/*******************************************************************************
+
+  Intel 10 Gigabit PCI Express Linux driver
+  Copyright(c) 1999 - 2008 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  Linux NICS <linux.nics@intel.com>
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+
+#include "ixgbe.h"
+#include <linux/dcbnl.h>
+
+/* Callbacks for DCB netlink in the kernel */
+#define BIT_DCB_MODE	0x01
+#define BIT_PFC		0x02
+#define BIT_PG_RX	0x04
+#define BIT_PG_TX	0x08
+
+int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
+                       struct ixgbe_dcb_config *dst_dcb_cfg, int tc_max)
+{
+	struct tc_configuration *src_tc_cfg = NULL;
+	struct tc_configuration *dst_tc_cfg = NULL;
+	int i;
+
+	if (!src_dcb_cfg || !dst_dcb_cfg)
+		return -EINVAL;
+
+	for (i = DCB_PG_ATTR_TC_0; i < tc_max + DCB_PG_ATTR_TC_0; i++) {
+		src_tc_cfg = &src_dcb_cfg->tc_config[i - DCB_PG_ATTR_TC_0];
+		dst_tc_cfg = &dst_dcb_cfg->tc_config[i - DCB_PG_ATTR_TC_0];
+
+		dst_tc_cfg->path[DCB_TX_CONFIG].prio_type =
+				src_tc_cfg->path[DCB_TX_CONFIG].prio_type;
+
+		dst_tc_cfg->path[DCB_TX_CONFIG].bwg_id =
+				src_tc_cfg->path[DCB_TX_CONFIG].bwg_id;
+
+		dst_tc_cfg->path[DCB_TX_CONFIG].bwg_percent =
+				src_tc_cfg->path[DCB_TX_CONFIG].bwg_percent;
+
+		dst_tc_cfg->path[DCB_TX_CONFIG].up_to_tc_bitmap =
+				src_tc_cfg->path[DCB_TX_CONFIG].up_to_tc_bitmap;
+
+		dst_tc_cfg->path[DCB_RX_CONFIG].prio_type =
+				src_tc_cfg->path[DCB_RX_CONFIG].prio_type;
+
+		dst_tc_cfg->path[DCB_RX_CONFIG].bwg_id =
+				src_tc_cfg->path[DCB_RX_CONFIG].bwg_id;
+
+		dst_tc_cfg->path[DCB_RX_CONFIG].bwg_percent =
+				src_tc_cfg->path[DCB_RX_CONFIG].bwg_percent;
+
+		dst_tc_cfg->path[DCB_RX_CONFIG].up_to_tc_bitmap =
+				src_tc_cfg->path[DCB_RX_CONFIG].up_to_tc_bitmap;
+	}
+
+	for (i = DCB_PG_ATTR_BW_ID_0; i < DCB_PG_ATTR_BW_ID_MAX; i++) {
+		dst_dcb_cfg->bw_percentage[DCB_TX_CONFIG]
+			[i-DCB_PG_ATTR_BW_ID_0] = src_dcb_cfg->bw_percentage
+				[DCB_TX_CONFIG][i-DCB_PG_ATTR_BW_ID_0];
+		dst_dcb_cfg->bw_percentage[DCB_RX_CONFIG]
+			[i-DCB_PG_ATTR_BW_ID_0] = src_dcb_cfg->bw_percentage
+				[DCB_RX_CONFIG][i-DCB_PG_ATTR_BW_ID_0];
+	}
+
+	for (i = DCB_PFC_UP_ATTR_0; i < DCB_PFC_UP_ATTR_MAX; i++) {
+		dst_dcb_cfg->tc_config[i - DCB_PFC_UP_ATTR_0].dcb_pfc =
+			src_dcb_cfg->tc_config[i - DCB_PFC_UP_ATTR_0].dcb_pfc;
+	}
+
+	return 0;
+}
+
+static u8 ixgbe_dcbnl_get_state(struct net_device *netdev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	DPRINTK(DRV, INFO, "Get DCB Admin Mode.\n");
+
+	return !!(adapter->flags & IXGBE_FLAG_DCB_ENABLED);
+}
+
+static u16 ixgbe_dcb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	/* All traffic should default to class 0 */
+	return 0;
+}
+
+static void ixgbe_dcbnl_set_state(struct net_device *netdev, u8 state)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	DPRINTK(DRV, INFO, "Set DCB Admin Mode.\n");
+
+	if (state > 0) {
+		/* Turn on DCB */
+		if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+			return;
+		} else {
+			if (netif_running(netdev))
+				netdev->stop(netdev);
+			ixgbe_reset_interrupt_capability(adapter);
+			ixgbe_napi_del_all(adapter);
+			kfree(adapter->tx_ring);
+			kfree(adapter->rx_ring);
+			adapter->tx_ring = NULL;
+			adapter->rx_ring = NULL;
+			netdev->select_queue = &ixgbe_dcb_select_queue;
+
+			adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
+			adapter->flags |= IXGBE_FLAG_DCB_ENABLED;
+			ixgbe_init_interrupt_scheme(adapter);
+			ixgbe_napi_add_all(adapter);
+			if (netif_running(netdev))
+				netdev->open(netdev);
+		}
+	} else {
+		/* Turn off DCB */
+		if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+			if (netif_running(netdev))
+				netdev->stop(netdev);
+			ixgbe_reset_interrupt_capability(adapter);
+			ixgbe_napi_del_all(adapter);
+			kfree(adapter->tx_ring);
+			kfree(adapter->rx_ring);
+			adapter->tx_ring = NULL;
+			adapter->rx_ring = NULL;
+			netdev->select_queue = NULL;
+
+			adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;
+			adapter->flags |= IXGBE_FLAG_RSS_ENABLED;
+			ixgbe_init_interrupt_scheme(adapter);
+			ixgbe_napi_add_all(adapter);
+			if (netif_running(netdev))
+				netdev->open(netdev);
+		} else {
+			return;
+		}
+	}
+}
+
+static void ixgbe_dcbnl_get_perm_hw_addr(struct net_device *netdev,
+					 u8 *perm_addr)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	int i;
+
+	for (i = 0; i < netdev->addr_len; i++)
+		perm_addr[i] = adapter->hw.mac.perm_addr[i];
+}
+
+static void ixgbe_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc,
+                                         u8 prio, u8 bwg_id, u8 bw_pct,
+                                         u8 up_map)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	if (prio != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[0].prio_type = prio;
+	if (bwg_id != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[0].bwg_id = bwg_id;
+	if (bw_pct != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[0].bwg_percent =
+			bw_pct;
+	if (up_map != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[0].up_to_tc_bitmap =
+			up_map;
+
+	if ((adapter->temp_dcb_cfg.tc_config[tc].path[0].prio_type !=
+	     adapter->dcb_cfg.tc_config[tc].path[0].prio_type) ||
+	    (adapter->temp_dcb_cfg.tc_config[tc].path[0].bwg_id !=
+	     adapter->dcb_cfg.tc_config[tc].path[0].bwg_id) ||
+	    (adapter->temp_dcb_cfg.tc_config[tc].path[0].bwg_percent !=
+	     adapter->dcb_cfg.tc_config[tc].path[0].bwg_percent) ||
+	    (adapter->temp_dcb_cfg.tc_config[tc].path[0].up_to_tc_bitmap !=
+	     adapter->dcb_cfg.tc_config[tc].path[0].up_to_tc_bitmap))
+		adapter->dcb_set_bitmap |= BIT_PG_TX;
+}
+
+static void ixgbe_dcbnl_set_pg_bwg_cfg_tx(struct net_device *netdev, int bwg_id,
+                                          u8 bw_pct)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	adapter->temp_dcb_cfg.bw_percentage[0][bwg_id] = bw_pct;
+
+	if (adapter->temp_dcb_cfg.bw_percentage[0][bwg_id] !=
+	    adapter->dcb_cfg.bw_percentage[0][bwg_id])
+		adapter->dcb_set_bitmap |= BIT_PG_RX;
+}
+
+static void ixgbe_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, int tc,
+                                         u8 prio, u8 bwg_id, u8 bw_pct,
+                                         u8 up_map)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	if (prio != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[1].prio_type = prio;
+	if (bwg_id != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[1].bwg_id = bwg_id;
+	if (bw_pct != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[1].bwg_percent =
+			bw_pct;
+	if (up_map != DCB_ATTR_VALUE_UNDEFINED)
+		adapter->temp_dcb_cfg.tc_config[tc].path[1].up_to_tc_bitmap =
+			up_map;
+
+	if ((adapter->temp_dcb_cfg.tc_config[tc].path[1].prio_type !=
+	     adapter->dcb_cfg.tc_config[tc].path[1].prio_type) ||
+	    (adapter->temp_dcb_cfg.tc_config[tc].path[1].bwg_id !=
+	     adapter->dcb_cfg.tc_config[tc].path[1].bwg_id) ||
+	    (adapter->temp_dcb_cfg.tc_config[tc].path[1].bwg_percent !=
+	     adapter->dcb_cfg.tc_config[tc].path[1].bwg_percent) ||
+	    (adapter->temp_dcb_cfg.tc_config[tc].path[1].up_to_tc_bitmap !=
+	     adapter->dcb_cfg.tc_config[tc].path[1].up_to_tc_bitmap))
+		adapter->dcb_set_bitmap |= BIT_PG_RX;
+}
+
+static void ixgbe_dcbnl_set_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id,
+                                          u8 bw_pct)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	adapter->temp_dcb_cfg.bw_percentage[1][bwg_id] = bw_pct;
+
+	if (adapter->temp_dcb_cfg.bw_percentage[1][bwg_id] !=
+	    adapter->dcb_cfg.bw_percentage[1][bwg_id])
+		adapter->dcb_set_bitmap |= BIT_PG_RX;
+}
+
+static void ixgbe_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int tc,
+                                         u8 *prio, u8 *bwg_id, u8 *bw_pct,
+                                         u8 *up_map)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	*prio = adapter->dcb_cfg.tc_config[tc].path[0].prio_type;
+	*bwg_id = adapter->dcb_cfg.tc_config[tc].path[0].bwg_id;
+	*bw_pct = adapter->dcb_cfg.tc_config[tc].path[0].bwg_percent;
+	*up_map = adapter->dcb_cfg.tc_config[tc].path[0].up_to_tc_bitmap;
+}
+
+static void ixgbe_dcbnl_get_pg_bwg_cfg_tx(struct net_device *netdev, int bwg_id,
+                                          u8 *bw_pct)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	*bw_pct = adapter->dcb_cfg.bw_percentage[0][bwg_id];
+}
+
+static void ixgbe_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int tc,
+                                         u8 *prio, u8 *bwg_id, u8 *bw_pct,
+                                         u8 *up_map)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	*prio = adapter->dcb_cfg.tc_config[tc].path[1].prio_type;
+	*bwg_id = adapter->dcb_cfg.tc_config[tc].path[1].bwg_id;
+	*bw_pct = adapter->dcb_cfg.tc_config[tc].path[1].bwg_percent;
+	*up_map = adapter->dcb_cfg.tc_config[tc].path[1].up_to_tc_bitmap;
+}
+
+static void ixgbe_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id,
+                                          u8 *bw_pct)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	*bw_pct = adapter->dcb_cfg.bw_percentage[1][bwg_id];
+}
+
+static void ixgbe_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority,
+                                    u8 setting)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	adapter->temp_dcb_cfg.tc_config[priority].dcb_pfc = setting;
+	if (adapter->temp_dcb_cfg.tc_config[priority].dcb_pfc !=
+	    adapter->dcb_cfg.tc_config[priority].dcb_pfc)
+		adapter->dcb_set_bitmap |= BIT_PFC;
+}
+
+static void ixgbe_dcbnl_get_pfc_cfg(struct net_device *netdev, int priority,
+                                    u8 *setting)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	*setting = adapter->dcb_cfg.tc_config[priority].dcb_pfc;
+}
+
+static u8 ixgbe_dcbnl_set_all(struct net_device *netdev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	int ret;
+
+	if (!adapter->dcb_set_bitmap)
+		return 1;
+
+	while (test_and_set_bit(__IXGBE_RESETTING, &adapter->state))
+		msleep(1);
+
+	if (netif_running(netdev))
+		ixgbe_down(adapter);
+
+	ret = ixgbe_copy_dcb_cfg(&adapter->temp_dcb_cfg, &adapter->dcb_cfg,
+				 adapter->ring_feature[RING_F_DCB].indices);
+	if (ret) {
+		clear_bit(__IXGBE_RESETTING, &adapter->state);
+		return ret;
+	}
+
+	if (netif_running(netdev))
+		ixgbe_up(adapter);
+
+	adapter->dcb_set_bitmap = 0x00;
+	clear_bit(__IXGBE_RESETTING, &adapter->state);
+	return ret;
+}
+
+struct dcbnl_rtnl_ops dcbnl_ops = {
+	.getstate	= ixgbe_dcbnl_get_state,
+	.setstate	= ixgbe_dcbnl_set_state,
+	.getpermhwaddr	= ixgbe_dcbnl_get_perm_hw_addr,
+	.setpgtccfgtx	= ixgbe_dcbnl_set_pg_tc_cfg_tx,
+	.setpgbwgcfgtx	= ixgbe_dcbnl_set_pg_bwg_cfg_tx,
+	.setpgtccfgrx	= ixgbe_dcbnl_set_pg_tc_cfg_rx,
+	.setpgbwgcfgrx	= ixgbe_dcbnl_set_pg_bwg_cfg_rx,
+	.getpgtccfgtx	= ixgbe_dcbnl_get_pg_tc_cfg_tx,
+	.getpgbwgcfgtx	= ixgbe_dcbnl_get_pg_bwg_cfg_tx,
+	.getpgtccfgrx	= ixgbe_dcbnl_get_pg_tc_cfg_rx,
+	.getpgbwgcfgrx	= ixgbe_dcbnl_get_pg_bwg_cfg_rx,
+	.setpfccfg	= ixgbe_dcbnl_set_pfc_cfg,
+	.getpfccfg	= ixgbe_dcbnl_get_pfc_cfg,
+	.setall		= ixgbe_dcbnl_set_all
+};
+
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index a610016a017..aaa4404e7c5 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -97,9 +97,18 @@ static struct ixgbe_stats ixgbe_gstrings_stats[] = {
 	((((struct ixgbe_adapter *)netdev_priv(netdev))->num_tx_queues + \
 	((struct ixgbe_adapter *)netdev_priv(netdev))->num_rx_queues) * \
 	(sizeof(struct ixgbe_queue_stats) / sizeof(u64)))
-#define IXGBE_STATS_LEN (IXGBE_GLOBAL_STATS_LEN + IXGBE_QUEUE_STATS_LEN)
 #define IXGBE_GLOBAL_STATS_LEN ARRAY_SIZE(ixgbe_gstrings_stats)
-#define IXGBE_STATS_LEN (IXGBE_GLOBAL_STATS_LEN + IXGBE_QUEUE_STATS_LEN)
+#define IXGBE_PB_STATS_LEN ( \
+                 (((struct ixgbe_adapter *)netdev->priv)->flags & \
+                 IXGBE_FLAG_DCB_ENABLED) ? \
+                 (sizeof(((struct ixgbe_adapter *)0)->stats.pxonrxc) + \
+                  sizeof(((struct ixgbe_adapter *)0)->stats.pxontxc) + \
+                  sizeof(((struct ixgbe_adapter *)0)->stats.pxoffrxc) + \
+                  sizeof(((struct ixgbe_adapter *)0)->stats.pxofftxc)) \
+                  / sizeof(u64) : 0)
+#define IXGBE_STATS_LEN (IXGBE_GLOBAL_STATS_LEN + \
+                         IXGBE_PB_STATS_LEN + \
+                         IXGBE_QUEUE_STATS_LEN)
 
 static int ixgbe_get_settings(struct net_device *netdev,
                               struct ethtool_cmd *ecmd)
@@ -831,6 +840,16 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 			data[i + k] = queue_stat[k];
 		i += k;
 	}
+	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+		for (j = 0; j < MAX_TX_PACKET_BUFFERS; j++) {
+			data[i++] = adapter->stats.pxontxc[j];
+			data[i++] = adapter->stats.pxofftxc[j];
+		}
+		for (j = 0; j < MAX_RX_PACKET_BUFFERS; j++) {
+			data[i++] = adapter->stats.pxonrxc[j];
+			data[i++] = adapter->stats.pxoffrxc[j];
+		}
+	}
 }
 
 static void ixgbe_get_strings(struct net_device *netdev, u32 stringset,
@@ -859,6 +878,13 @@ static void ixgbe_get_strings(struct net_device *netdev, u32 stringset,
 			sprintf(p, "rx_queue_%u_bytes", i);
 			p += ETH_GSTRING_LEN;
 		}
+		if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+			for (i = 0; i < MAX_TX_PACKET_BUFFERS; i++) {
+				sprintf(p, "tx_pb_%u_pxon", i);
+			}
+			for (i = 0; i < MAX_RX_PACKET_BUFFERS; i++) {
+			}
+		}
 		/* BUG_ON(p - data != IXGBE_STATS_LEN * ETH_GSTRING_LEN); */
 		break;
 	}
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 40108523377..91dde9cdab6 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -404,7 +404,7 @@ static void ixgbe_receive_skb(struct ixgbe_adapter *adapter,
 
 	if (adapter->netdev->features & NETIF_F_LRO &&
 	    skb->ip_summed == CHECKSUM_UNNECESSARY) {
-		if (adapter->vlgrp && is_vlan)
+		if (adapter->vlgrp && is_vlan && (tag != 0))
 			lro_vlan_hwaccel_receive_skb(&ring->lro_mgr, skb,
 			                             adapter->vlgrp, tag,
 			                             rx_desc);
@@ -413,12 +413,12 @@ static void ixgbe_receive_skb(struct ixgbe_adapter *adapter,
 		ring->lro_used = true;
 	} else {
 		if (!(adapter->flags & IXGBE_FLAG_IN_NETPOLL)) {
-			if (adapter->vlgrp && is_vlan)
+			if (adapter->vlgrp && is_vlan && (tag != 0))
 				vlan_hwaccel_receive_skb(skb, adapter->vlgrp, tag);
 			else
 				netif_receive_skb(skb);
 		} else {
-			if (adapter->vlgrp && is_vlan)
+			if (adapter->vlgrp && is_vlan && (tag != 0))
 				vlan_hwaccel_rx(skb, adapter->vlgrp, tag);
 			else
 				netif_rx(skb);
@@ -1670,10 +1670,12 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
 	 * effects of setting this bit are only that SRRCTL must be
 	 * fully programmed [0..15]
 	 */
-	rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
-	rdrxctl |= IXGBE_RDRXCTL_MVMEN;
-	IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
-
+	if (adapter->flags &
+	    (IXGBE_FLAG_RSS_ENABLED | IXGBE_FLAG_VMDQ_ENABLED)) {
+		rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
+		rdrxctl |= IXGBE_RDRXCTL_MVMEN;
+		IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
+	}
 
 	if (adapter->flags & IXGBE_FLAG_RSS_ENABLED) {
 		/* Fill out redirection table */
@@ -1732,6 +1734,16 @@ static void ixgbe_vlan_rx_register(struct net_device *netdev,
 		ixgbe_irq_disable(adapter);
 	adapter->vlgrp = grp;
 
+	/*
+	 * For a DCB driver, always enable VLAN tag stripping so we can
+	 * still receive traffic from a DCB-enabled host even if we're
+	 * not in DCB mode.
+	 */
+	ctrl = IXGBE_READ_REG(&adapter->hw, IXGBE_VLNCTRL);
+	ctrl |= IXGBE_VLNCTRL_VME;
+	ctrl &= ~IXGBE_VLNCTRL_CFIEN;
+	IXGBE_WRITE_REG(&adapter->hw, IXGBE_VLNCTRL, ctrl);
+
 	if (grp) {
 		/* enable VLAN tag insert/strip */
 		ctrl = IXGBE_READ_REG(&adapter->hw, IXGBE_VLNCTRL);
@@ -1896,6 +1908,44 @@ static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter)
 	}
 }
 
+#ifdef CONFIG_IXGBE_DCBNL
+/*
+ * ixgbe_configure_dcb - Configure DCB hardware
+ * @adapter: ixgbe adapter struct
+ *
+ * This is called by the driver on open to configure the DCB hardware.
+ * This is also called by the gennetlink interface when reconfiguring
+ * the DCB state.
+ */
+static void ixgbe_configure_dcb(struct ixgbe_adapter *adapter)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 txdctl, vlnctrl;
+	int i, j;
+
+	ixgbe_dcb_check_config(&adapter->dcb_cfg);
+	ixgbe_dcb_calculate_tc_credits(&adapter->dcb_cfg, DCB_TX_CONFIG);
+	ixgbe_dcb_calculate_tc_credits(&adapter->dcb_cfg, DCB_RX_CONFIG);
+
+	/* reconfigure the hardware */
+	ixgbe_dcb_hw_config(&adapter->hw, &adapter->dcb_cfg);
+
+	for (i = 0; i < adapter->num_tx_queues; i++) {
+		j = adapter->tx_ring[i].reg_idx;
+		txdctl = IXGBE_READ_REG(hw, IXGBE_TXDCTL(j));
+		/* PThresh workaround for Tx hang with DFP enabled. */
+		txdctl |= 32;
+		IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(j), txdctl);
+	}
+	/* Enable VLAN tag insert/strip */
+	vlnctrl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL);
+	vlnctrl |= IXGBE_VLNCTRL_VME | IXGBE_VLNCTRL_VFE;
+	vlnctrl &= ~IXGBE_VLNCTRL_CFIEN;
+	IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctrl);
+	hw->mac.ops.set_vfta(&adapter->hw, 0, 0, true);
+}
+
+#endif
 static void ixgbe_configure(struct ixgbe_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
@@ -1904,6 +1954,16 @@ static void ixgbe_configure(struct ixgbe_adapter *adapter)
 	ixgbe_set_rx_mode(netdev);
 
 	ixgbe_restore_vlan(adapter);
+#ifdef CONFIG_IXGBE_DCBNL
+	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+		netif_set_gso_max_size(netdev, 32768);
+		ixgbe_configure_dcb(adapter);
+	} else {
+		netif_set_gso_max_size(netdev, 65536);
+	}
+#else
+	netif_set_gso_max_size(netdev, 65536);
+#endif
 
 	ixgbe_configure_tx(adapter);
 	ixgbe_configure_rx(adapter);
@@ -1995,9 +2055,6 @@ static int ixgbe_up_complete(struct ixgbe_adapter *adapter)
 
 	ixgbe_irq_enable(adapter);
 
-	/* enable transmits */
-	netif_tx_start_all_queues(netdev);
-
 	/* bring the link up in the watchdog, this could race with our first
 	 * link up interrupt but shouldn't be a problem */
 	adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
@@ -2260,6 +2317,11 @@ static void ixgbe_reset_task(struct work_struct *work)
 	struct ixgbe_adapter *adapter;
 	adapter = container_of(work, struct ixgbe_adapter, reset_task);
 
+	/* If we're already down or resetting, just bail */
+	if (test_bit(__IXGBE_DOWN, &adapter->state) ||
+	    test_bit(__IXGBE_RESETTING, &adapter->state))
+		return;
+
 	adapter->tx_timeout_count++;
 
 	ixgbe_reinit_locked(adapter);
@@ -2269,15 +2331,31 @@ static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
 {
 	int nrq = 1, ntq = 1;
 	int feature_mask = 0, rss_i, rss_m;
+	int dcb_i, dcb_m;
 
 	/* Number of supported queues */
 	switch (adapter->hw.mac.type) {
 	case ixgbe_mac_82598EB:
+		dcb_i = adapter->ring_feature[RING_F_DCB].indices;
+		dcb_m = 0;
 		rss_i = adapter->ring_feature[RING_F_RSS].indices;
 		rss_m = 0;
 		feature_mask |= IXGBE_FLAG_RSS_ENABLED;
+		feature_mask |= IXGBE_FLAG_DCB_ENABLED;
 
 		switch (adapter->flags & feature_mask) {
+		case (IXGBE_FLAG_RSS_ENABLED | IXGBE_FLAG_DCB_ENABLED):
+			dcb_m = 0x7 << 3;
+			rss_i = min(8, rss_i);
+			rss_m = 0x7;
+			nrq = dcb_i * rss_i;
+			ntq = min(MAX_TX_QUEUES, dcb_i * rss_i);
+			break;
+		case (IXGBE_FLAG_DCB_ENABLED):
+			dcb_m = 0x7 << 3;
+			nrq = dcb_i;
+			ntq = dcb_i;
+			break;
 		case (IXGBE_FLAG_RSS_ENABLED):
 			rss_m = 0xF;
 			nrq = rss_i;
@@ -2285,6 +2363,8 @@ static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
 			break;
 		case 0:
 		default:
+			dcb_i = 0;
+			dcb_m = 0;
 			rss_i = 0;
 			rss_m = 0;
 			nrq = 1;
@@ -2292,6 +2372,12 @@ static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
 			break;
 		}
 
+		/* Sanity check, we should never have zero queues */
+		nrq = (nrq ?:1);
+		ntq = (ntq ?:1);
+
+		adapter->ring_feature[RING_F_DCB].indices = dcb_i;
+		adapter->ring_feature[RING_F_DCB].mask = dcb_m;
 		adapter->ring_feature[RING_F_RSS].indices = rss_i;
 		adapter->ring_feature[RING_F_RSS].mask = rss_m;
 		break;
@@ -2343,6 +2429,7 @@ static void ixgbe_acquire_msix_vectors(struct ixgbe_adapter *adapter,
 		adapter->flags &= ~IXGBE_FLAG_MSIX_ENABLED;
 		kfree(adapter->msix_entries);
 		adapter->msix_entries = NULL;
+		adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;
 		adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
 		ixgbe_set_num_queues(adapter);
 	} else {
@@ -2362,15 +2449,42 @@ static void __devinit ixgbe_cache_ring_register(struct ixgbe_adapter *adapter)
 {
 	int feature_mask = 0, rss_i;
 	int i, txr_idx, rxr_idx;
+	int dcb_i;
 
 	/* Number of supported queues */
 	switch (adapter->hw.mac.type) {
 	case ixgbe_mac_82598EB:
+		dcb_i = adapter->ring_feature[RING_F_DCB].indices;
 		rss_i = adapter->ring_feature[RING_F_RSS].indices;
 		txr_idx = 0;
 		rxr_idx = 0;
+		feature_mask |= IXGBE_FLAG_DCB_ENABLED;
 		feature_mask |= IXGBE_FLAG_RSS_ENABLED;
 		switch (adapter->flags & feature_mask) {
+		case (IXGBE_FLAG_RSS_ENABLED | IXGBE_FLAG_DCB_ENABLED):
+			for (i = 0; i < dcb_i; i++) {
+				int j;
+				/* Rx first */
+				for (j = 0; j < adapter->num_rx_queues; j++) {
+					adapter->rx_ring[rxr_idx].reg_idx =
+						i << 3 | j;
+					rxr_idx++;
+				}
+				/* Tx now */
+				for (j = 0; j < adapter->num_tx_queues; j++) {
+					adapter->tx_ring[txr_idx].reg_idx =
+						i << 2 | (j >> 1);
+					if (j & 1)
+						txr_idx++;
+				}
+			}
+		case (IXGBE_FLAG_DCB_ENABLED):
+			/* the number of queues is assumed to be symmetric */
+			for (i = 0; i < dcb_i; i++) {
+				adapter->rx_ring[i].reg_idx = i << 3;
+				adapter->tx_ring[i].reg_idx = i << 2;
+			}
+			break;
 		case (IXGBE_FLAG_RSS_ENABLED):
 			for (i = 0; i < adapter->num_rx_queues; i++)
 				adapter->rx_ring[i].reg_idx = i;
@@ -2395,7 +2509,7 @@ static void __devinit ixgbe_cache_ring_register(struct ixgbe_adapter *adapter)
  * number of queues at compile-time.  The polling_netdev array is
  * intended for Multiqueue, but should work fine with a single queue.
  **/
-static int __devinit ixgbe_alloc_queues(struct ixgbe_adapter *adapter)
+static int ixgbe_alloc_queues(struct ixgbe_adapter *adapter)
 {
 	int i;
 
@@ -2465,6 +2579,7 @@ static int __devinit ixgbe_set_interrupt_capability(struct ixgbe_adapter
 	adapter->msix_entries = kcalloc(v_budget,
 	                                sizeof(struct msix_entry), GFP_KERNEL);
 	if (!adapter->msix_entries) {
+		adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;
 		adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
 		ixgbe_set_num_queues(adapter);
 		kfree(adapter->tx_ring);
@@ -2505,7 +2620,7 @@ out:
 	return err;
 }
 
-static void ixgbe_reset_interrupt_capability(struct ixgbe_adapter *adapter)
+void ixgbe_reset_interrupt_capability(struct ixgbe_adapter *adapter)
 {
 	if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED) {
 		adapter->flags &= ~IXGBE_FLAG_MSIX_ENABLED;
@@ -2529,7 +2644,7 @@ static void ixgbe_reset_interrupt_capability(struct ixgbe_adapter *adapter)
  * - Hardware queue count (num_*_queues)
  *   - defined by miscellaneous hardware support/features (RSS, etc.)
  **/
-static int __devinit ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)
+int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)
 {
 	int err;
 
@@ -2577,6 +2692,10 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct pci_dev *pdev = adapter->pdev;
 	unsigned int rss;
+#ifdef CONFIG_IXGBE_DCBNL
+	int j;
+	struct tc_configuration *tc;
+#endif
 
 	/* PCI config space info */
 
@@ -2590,6 +2709,27 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 	rss = min(IXGBE_MAX_RSS_INDICES, (int)num_online_cpus());
 	adapter->ring_feature[RING_F_RSS].indices = rss;
 	adapter->flags |= IXGBE_FLAG_RSS_ENABLED;
+	adapter->ring_feature[RING_F_DCB].indices = IXGBE_MAX_DCB_INDICES;
+
+#ifdef CONFIG_IXGBE_DCBNL
+	/* Configure DCB traffic classes */
+	for (j = 0; j < MAX_TRAFFIC_CLASS; j++) {
+		tc = &adapter->dcb_cfg.tc_config[j];
+		tc->path[DCB_TX_CONFIG].bwg_id = 0;
+		tc->path[DCB_TX_CONFIG].bwg_percent = 12 + (j & 1);
+		tc->path[DCB_RX_CONFIG].bwg_id = 0;
+		tc->path[DCB_RX_CONFIG].bwg_percent = 12 + (j & 1);
+		tc->dcb_pfc = pfc_disabled;
+	}
+	adapter->dcb_cfg.bw_percentage[DCB_TX_CONFIG][0] = 100;
+	adapter->dcb_cfg.bw_percentage[DCB_RX_CONFIG][0] = 100;
+	adapter->dcb_cfg.rx_pba_cfg = pba_equal;
+	adapter->dcb_cfg.round_robin_enable = false;
+	adapter->dcb_set_bitmap = 0x00;
+	ixgbe_copy_dcb_cfg(&adapter->dcb_cfg, &adapter->temp_dcb_cfg,
+	                   adapter->ring_feature[RING_F_DCB].indices);
+
+#endif
 	if (hw->mac.ops.get_media_type &&
 	    (hw->mac.ops.get_media_type(hw) == ixgbe_media_type_copper))
 		adapter->flags |= IXGBE_FLAG_FAN_FAIL_CAPABLE;
@@ -2967,7 +3107,7 @@ static int ixgbe_close(struct net_device *netdev)
  * @adapter: private struct
  * helper function to napi_add each possible q_vector->napi
  */
-static void ixgbe_napi_add_all(struct ixgbe_adapter *adapter)
+void ixgbe_napi_add_all(struct ixgbe_adapter *adapter)
 {
 	int q_idx, q_vectors;
 	int (*poll)(struct napi_struct *, int);
@@ -2988,7 +3128,7 @@ static void ixgbe_napi_add_all(struct ixgbe_adapter *adapter)
 	}
 }
 
-static void ixgbe_napi_del_all(struct ixgbe_adapter *adapter)
+void ixgbe_napi_del_all(struct ixgbe_adapter *adapter)
 {
 	int q_idx;
 	int q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
@@ -3109,6 +3249,18 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
 		adapter->stats.mpc[i] += mpc;
 		total_mpc += adapter->stats.mpc[i];
 		adapter->stats.rnbc[i] += IXGBE_READ_REG(hw, IXGBE_RNBC(i));
+		adapter->stats.qptc[i] += IXGBE_READ_REG(hw, IXGBE_QPTC(i));
+		adapter->stats.qbtc[i] += IXGBE_READ_REG(hw, IXGBE_QBTC(i));
+		adapter->stats.qprc[i] += IXGBE_READ_REG(hw, IXGBE_QPRC(i));
+		adapter->stats.qbrc[i] += IXGBE_READ_REG(hw, IXGBE_QBRC(i));
+		adapter->stats.pxonrxc[i] += IXGBE_READ_REG(hw,
+		                                            IXGBE_PXONRXC(i));
+		adapter->stats.pxontxc[i] += IXGBE_READ_REG(hw,
+		                                            IXGBE_PXONTXC(i));
+		adapter->stats.pxoffrxc[i] += IXGBE_READ_REG(hw,
+		                                            IXGBE_PXOFFRXC(i));
+		adapter->stats.pxofftxc[i] += IXGBE_READ_REG(hw,
+		                                            IXGBE_PXOFFTXC(i));
 	}
 	adapter->stats.gprc += IXGBE_READ_REG(hw, IXGBE_GPRC);
 	/* work around hardware counting issue */
@@ -3248,6 +3400,7 @@ static void ixgbe_watchdog_task(struct work_struct *work)
 			         (FLOW_TX ? "TX" : "None"))));
 
 			netif_carrier_on(netdev);
+			netif_tx_wake_all_queues(netdev);
 		} else {
 			/* Force detection of hung controller */
 			adapter->detect_tx_hung = true;
@@ -3258,6 +3411,7 @@ static void ixgbe_watchdog_task(struct work_struct *work)
 		if (netif_carrier_ok(netdev)) {
 			DPRINTK(LINK, INFO, "NIC Link is Down\n");
 			netif_carrier_off(netdev);
+			netif_tx_stop_all_queues(netdev);
 		}
 	}
 
@@ -3604,6 +3758,14 @@ static int ixgbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 
 	if (adapter->vlgrp && vlan_tx_tag_present(skb)) {
 		tx_flags |= vlan_tx_tag_get(skb);
+		if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+			tx_flags &= ~IXGBE_TX_FLAGS_VLAN_PRIO_MASK;
+			tx_flags |= (skb->queue_mapping << 13);
+		}
+		tx_flags <<= IXGBE_TX_FLAGS_VLAN_SHIFT;
+		tx_flags |= IXGBE_TX_FLAGS_VLAN;
+	} else if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+		tx_flags |= (skb->queue_mapping << 13);
 		tx_flags <<= IXGBE_TX_FLAGS_VLAN_SHIFT;
 		tx_flags |= IXGBE_TX_FLAGS_VLAN;
 	}
@@ -3878,6 +4040,13 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	netdev->vlan_features |= NETIF_F_IP_CSUM;
 	netdev->vlan_features |= NETIF_F_SG;
 
+	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
+		adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
+
+#ifdef CONFIG_IXGBE_DCBNL
+	netdev->dcbnl_ops = &dcbnl_ops;
+#endif
+
 	if (pci_using_dac)
 		netdev->features |= NETIF_F_HIGHDMA;
 
@@ -3946,6 +4115,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	}
 
 	netif_carrier_off(netdev);
+	netif_tx_stop_all_queues(netdev);
 
 	ixgbe_napi_add_all(adapter);
 
diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
new file mode 100644
index 00000000000..32d32c1ee41
--- /dev/null
+++ b/include/linux/dcbnl.h
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Lucy Liu <lucy.liu@intel.com>
+ */
+
+#ifndef __LINUX_DCBNL_H__
+#define __LINUX_DCBNL_H__
+
+#define DCB_PROTO_VERSION 1
+
+struct dcbmsg {
+	unsigned char      dcb_family;
+	__u8               cmd;
+	__u16              dcb_pad;
+};
+
+/**
+ * enum dcbnl_commands - supported DCB commands
+ *
+ * @DCB_CMD_UNDEFINED: unspecified command to catch errors
+ * @DCB_CMD_GSTATE: request the state of DCB in the device
+ * @DCB_CMD_SSTATE: set the state of DCB in the device
+ * @DCB_CMD_PGTX_GCFG: request the priority group configuration for Tx
+ * @DCB_CMD_PGTX_SCFG: set the priority group configuration for Tx
+ * @DCB_CMD_PGRX_GCFG: request the priority group configuration for Rx
+ * @DCB_CMD_PGRX_SCFG: set the priority group configuration for Rx
+ * @DCB_CMD_PFC_GCFG: request the priority flow control configuration
+ * @DCB_CMD_PFC_SCFG: set the priority flow control configuration
+ * @DCB_CMD_SET_ALL: apply all changes to the underlying device
+ * @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying
+ *                        device.  Only useful when using bonding.
+ */
+enum dcbnl_commands {
+	DCB_CMD_UNDEFINED,
+
+	DCB_CMD_GSTATE,
+	DCB_CMD_SSTATE,
+
+	DCB_CMD_PGTX_GCFG,
+	DCB_CMD_PGTX_SCFG,
+	DCB_CMD_PGRX_GCFG,
+	DCB_CMD_PGRX_SCFG,
+
+	DCB_CMD_PFC_GCFG,
+	DCB_CMD_PFC_SCFG,
+
+	DCB_CMD_SET_ALL,
+	DCB_CMD_GPERM_HWADDR,
+
+	__DCB_CMD_ENUM_MAX,
+	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
+};
+
+
+/**
+ * enum dcbnl_attrs - DCB top-level netlink attributes
+ *
+ * @DCB_ATTR_UNDEFINED: unspecified attribute to catch errors
+ * @DCB_ATTR_IFNAME: interface name of the underlying device (NLA_STRING)
+ * @DCB_ATTR_STATE: enable state of DCB in the device (NLA_U8)
+ * @DCB_ATTR_PFC_STATE: enable state of PFC in the device (NLA_U8)
+ * @DCB_ATTR_PFC_CFG: priority flow control configuration (NLA_NESTED)
+ * @DCB_ATTR_NUM_TC: number of traffic classes supported in the device (NLA_U8)
+ * @DCB_ATTR_PG_CFG: priority group configuration (NLA_NESTED)
+ * @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8)
+ * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED)
+ */
+enum dcbnl_attrs {
+	DCB_ATTR_UNDEFINED,
+
+	DCB_ATTR_IFNAME,
+	DCB_ATTR_STATE,
+	DCB_ATTR_PFC_STATE,
+	DCB_ATTR_PFC_CFG,
+	DCB_ATTR_NUM_TC,
+	DCB_ATTR_PG_CFG,
+	DCB_ATTR_SET_ALL,
+	DCB_ATTR_PERM_HWADDR,
+
+	__DCB_ATTR_ENUM_MAX,
+	DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1,
+};
+
+/**
+ * enum dcbnl_pfc_attrs - DCB Priority Flow Control user priority nested attrs
+ *
+ * @DCB_PFC_UP_ATTR_UNDEFINED: unspecified attribute to catch errors
+ * @DCB_PFC_UP_ATTR_0: Priority Flow Control value for User Priority 0 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_1: Priority Flow Control value for User Priority 1 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_2: Priority Flow Control value for User Priority 2 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_3: Priority Flow Control value for User Priority 3 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_4: Priority Flow Control value for User Priority 4 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_5: Priority Flow Control value for User Priority 5 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_6: Priority Flow Control value for User Priority 6 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_7: Priority Flow Control value for User Priority 7 (NLA_U8)
+ * @DCB_PFC_UP_ATTR_MAX: highest attribute number currently defined
+ * @DCB_PFC_UP_ATTR_ALL: apply to all priority flow control attrs (NLA_FLAG)
+ *
+ */
+enum dcbnl_pfc_up_attrs {
+	DCB_PFC_UP_ATTR_UNDEFINED,
+
+	DCB_PFC_UP_ATTR_0,
+	DCB_PFC_UP_ATTR_1,
+	DCB_PFC_UP_ATTR_2,
+	DCB_PFC_UP_ATTR_3,
+	DCB_PFC_UP_ATTR_4,
+	DCB_PFC_UP_ATTR_5,
+	DCB_PFC_UP_ATTR_6,
+	DCB_PFC_UP_ATTR_7,
+	DCB_PFC_UP_ATTR_ALL,
+
+	__DCB_PFC_UP_ATTR_ENUM_MAX,
+	DCB_PFC_UP_ATTR_MAX = __DCB_PFC_UP_ATTR_ENUM_MAX - 1,
+};
+
+/**
+ * enum dcbnl_pg_attrs - DCB Priority Group attributes
+ *
+ * @DCB_PG_ATTR_UNDEFINED: unspecified attribute to catch errors
+ * @DCB_PG_ATTR_TC_0: Priority Group Traffic Class 0 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_1: Priority Group Traffic Class 1 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_2: Priority Group Traffic Class 2 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_3: Priority Group Traffic Class 3 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_4: Priority Group Traffic Class 4 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_5: Priority Group Traffic Class 5 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_6: Priority Group Traffic Class 6 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_7: Priority Group Traffic Class 7 configuration (NLA_NESTED)
+ * @DCB_PG_ATTR_TC_MAX: highest attribute number currently defined
+ * @DCB_PG_ATTR_TC_ALL: apply to all traffic classes (NLA_NESTED)
+ * @DCB_PG_ATTR_BW_ID_0: Percent of link bandwidth for Priority Group 0 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_1: Percent of link bandwidth for Priority Group 1 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_2: Percent of link bandwidth for Priority Group 2 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_3: Percent of link bandwidth for Priority Group 3 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_4: Percent of link bandwidth for Priority Group 4 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_5: Percent of link bandwidth for Priority Group 5 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_6: Percent of link bandwidth for Priority Group 6 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_7: Percent of link bandwidth for Priority Group 7 (NLA_U8)
+ * @DCB_PG_ATTR_BW_ID_MAX: highest attribute number currently defined
+ * @DCB_PG_ATTR_BW_ID_ALL: apply to all priority groups (NLA_FLAG)
+ *
+ */
+enum dcbnl_pg_attrs {
+	DCB_PG_ATTR_UNDEFINED,
+
+	DCB_PG_ATTR_TC_0,
+	DCB_PG_ATTR_TC_1,
+	DCB_PG_ATTR_TC_2,
+	DCB_PG_ATTR_TC_3,
+	DCB_PG_ATTR_TC_4,
+	DCB_PG_ATTR_TC_5,
+	DCB_PG_ATTR_TC_6,
+	DCB_PG_ATTR_TC_7,
+	DCB_PG_ATTR_TC_MAX,
+	DCB_PG_ATTR_TC_ALL,
+
+	DCB_PG_ATTR_BW_ID_0,
+	DCB_PG_ATTR_BW_ID_1,
+	DCB_PG_ATTR_BW_ID_2,
+	DCB_PG_ATTR_BW_ID_3,
+	DCB_PG_ATTR_BW_ID_4,
+	DCB_PG_ATTR_BW_ID_5,
+	DCB_PG_ATTR_BW_ID_6,
+	DCB_PG_ATTR_BW_ID_7,
+	DCB_PG_ATTR_BW_ID_MAX,
+	DCB_PG_ATTR_BW_ID_ALL,
+
+	__DCB_PG_ATTR_ENUM_MAX,
+	DCB_PG_ATTR_MAX = __DCB_PG_ATTR_ENUM_MAX - 1,
+};
+
+/**
+ * enum dcbnl_tc_attrs - DCB Traffic Class attributes
+ *
+ * @DCB_TC_ATTR_PARAM_UNDEFINED: unspecified attribute to catch errors
+ * @DCB_TC_ATTR_PARAM_PGID: (NLA_U8) Priority group the traffic class belongs to
+ *                          Valid values are:  0-7
+ * @DCB_TC_ATTR_PARAM_UP_MAPPING: (NLA_U8) Traffic class to user priority map
+ *                                Some devices may not support changing the
+ *                                user priority map of a TC.
+ * @DCB_TC_ATTR_PARAM_STRICT_PRIO: (NLA_U8) Strict priority setting
+ *                                 0 - none
+ *                                 1 - group strict
+ *                                 2 - link strict
+ * @DCB_TC_ATTR_PARAM_BW_PCT: optional - (NLA_U8) If supported by the device and
+ *                            not configured to use link strict priority,
+ *                            this is the percentage of bandwidth of the
+ *                            priority group this traffic class belongs to
+ * @DCB_TC_ATTR_PARAM_ALL: (NLA_FLAG) all traffic class parameters
+ *
+ */
+enum dcbnl_tc_attrs {
+	DCB_TC_ATTR_PARAM_UNDEFINED,
+
+	DCB_TC_ATTR_PARAM_PGID,
+	DCB_TC_ATTR_PARAM_UP_MAPPING,
+	DCB_TC_ATTR_PARAM_STRICT_PRIO,
+	DCB_TC_ATTR_PARAM_BW_PCT,
+	DCB_TC_ATTR_PARAM_ALL,
+
+	__DCB_TC_ATTR_PARAM_ENUM_MAX,
+	DCB_TC_ATTR_PARAM_MAX = __DCB_TC_ATTR_PARAM_ENUM_MAX - 1,
+};
+
+/**
+ * enum dcb_general_attr_values - general DCB attribute values
+ *
+ * @DCB_ATTR_UNDEFINED: value used to indicate an attribute is not supported
+ *
+ */
+enum dcb_general_attr_values {
+	DCB_ATTR_VALUE_UNDEFINED = 0xff
+};
+
+
+#endif /* __LINUX_DCBNL_H__ */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d8fb23679ee..6095af572df 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -43,6 +43,9 @@
 
 #include <net/net_namespace.h>
 #include <net/dsa.h>
+#ifdef CONFIG_DCBNL
+#include <net/dcbnl.h>
+#endif
 
 struct vlan_group;
 struct ethtool_ops;
@@ -843,6 +846,11 @@ struct net_device
 #define GSO_MAX_SIZE		65536
 	unsigned int		gso_max_size;
 
+#ifdef CONFIG_DCBNL
+	/* Data Center Bridging netlink ops */
+	struct dcbnl_rtnl_ops *dcbnl_ops;
+#endif
+
 #ifdef CONFIG_COMPAT_NET_DEV_OPS
 	struct {
 		int			(*init)(struct net_device *dev);
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 2b3d51c6ec9..e88f7058b3a 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -107,6 +107,11 @@ enum {
 	RTM_GETADDRLABEL,
 #define RTM_GETADDRLABEL RTM_GETADDRLABEL
 
+	RTM_GETDCB = 78,
+#define RTM_GETDCB RTM_GETDCB
+	RTM_SETDCB,
+#define RTM_SETDCB RTM_SETDCB
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
new file mode 100644
index 00000000000..0ef0c5a46d8
--- /dev/null
+++ b/include/net/dcbnl.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Lucy Liu <lucy.liu@intel.com>
+ */
+
+#ifndef __NET_DCBNL_H__
+#define __NET_DCBNL_H__
+
+/*
+ * Ops struct for the netlink callbacks.  Used by DCB-enabled drivers through
+ * the netdevice struct.
+ */
+struct dcbnl_rtnl_ops {
+	u8   (*getstate)(struct net_device *);
+	void (*setstate)(struct net_device *, u8);
+	void (*getpermhwaddr)(struct net_device *, u8 *);
+	void (*setpgtccfgtx)(struct net_device *, int, u8, u8, u8, u8);
+	void (*setpgbwgcfgtx)(struct net_device *, int, u8);
+	void (*setpgtccfgrx)(struct net_device *, int, u8, u8, u8, u8);
+	void (*setpgbwgcfgrx)(struct net_device *, int, u8);
+	void (*getpgtccfgtx)(struct net_device *, int, u8 *, u8 *, u8 *, u8 *);
+	void (*getpgbwgcfgtx)(struct net_device *, int, u8 *);
+	void (*getpgtccfgrx)(struct net_device *, int, u8 *, u8 *, u8 *, u8 *);
+	void (*getpgbwgcfgrx)(struct net_device *, int, u8 *);
+	void (*setpfccfg)(struct net_device *, int, u8);
+	void (*getpfccfg)(struct net_device *, int, u8 *);
+	u8   (*setall)(struct net_device *);
+};
+
+#endif /* __NET_DCBNL_H__ */
diff --git a/net/Kconfig b/net/Kconfig
index 4e2e40ba8ba..c7d01c3a23c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -194,6 +194,7 @@ source "net/lapb/Kconfig"
 source "net/econet/Kconfig"
 source "net/wanrouter/Kconfig"
 source "net/sched/Kconfig"
+source "net/dcb/Kconfig"
 
 menu "Network testing"
 
diff --git a/net/Makefile b/net/Makefile
index 27d1f10dc0e..83b064651f1 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -57,6 +57,9 @@ obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
+ifeq ($(CONFIG_DCBNL),y)
+obj-$(CONFIG_DCB)		+= dcb/
+endif
 
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig
new file mode 100644
index 00000000000..bdf38802d33
--- /dev/null
+++ b/net/dcb/Kconfig
@@ -0,0 +1,12 @@
+config DCB
+        tristate "Data Center Bridging support"
+
+config DCBNL
+	bool "Data Center Bridging netlink interface support"
+	depends on DCB
+	default n
+	---help---
+	  This option turns on the netlink interface
+	  (dcbnl) for Data Center Bridging capable devices.
+
+	  If unsure, say N.
diff --git a/net/dcb/Makefile b/net/dcb/Makefile
new file mode 100644
index 00000000000..9930f4cde81
--- /dev/null
+++ b/net/dcb/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DCB) += dcbnl.o
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
new file mode 100644
index 00000000000..516e8be83d7
--- /dev/null
+++ b/net/dcb/dcbnl.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Lucy Liu <lucy.liu@intel.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include <linux/dcbnl.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+
+/**
+ * Data Center Bridging (DCB) is a collection of Ethernet enhancements
+ * intended to allow network traffic with differing requirements
+ * (highly reliable, no drops vs. best effort vs. low latency) to operate
+ * and co-exist on Ethernet.  Current DCB features are:
+ *
+ * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a
+ *   framework for assigning bandwidth guarantees to traffic classes.
+ *
+ * Priority-based Flow Control (PFC) - provides a flow control mechanism which
+ *   can work independently for each 802.1p priority.
+ *
+ * Congestion Notification - provides a mechanism for end-to-end congestion
+ *   control for protocols which do not have built-in congestion management.
+ *
+ * More information about the emerging standards for these Ethernet features
+ * can be found at: http://www.ieee802.org/1/pages/dcbridges.html
+ *
+ * This file implements an rtnetlink interface to allow configuration of DCB
+ * features for capable devices.
+ */
+
+MODULE_AUTHOR("Lucy Liu, <lucy.liu@intel.com>");
+MODULE_DESCRIPTION("Data Center Bridging generic netlink interface");
+MODULE_LICENSE("GPL");
+
+/**************** DCB attribute policies *************************************/
+
+/* DCB netlink attributes policy */
+static struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
+	[DCB_ATTR_IFNAME]    = {.type = NLA_STRING, .len = IFNAMSIZ - 1},
+	[DCB_ATTR_STATE]     = {.type = NLA_U8},
+	[DCB_ATTR_PFC_CFG]   = {.type = NLA_NESTED},
+	[DCB_ATTR_PG_CFG]    = {.type = NLA_NESTED},
+	[DCB_ATTR_SET_ALL]   = {.type = NLA_U8},
+	[DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
+};
+
+/* DCB priority flow control to User Priority nested attributes */
+static struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = {
+	[DCB_PFC_UP_ATTR_0]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_1]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_2]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_3]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_4]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_5]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_6]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_7]   = {.type = NLA_U8},
+	[DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB priority grouping nested attributes */
+static struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = {
+	[DCB_PG_ATTR_TC_0]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_1]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_2]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_3]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_4]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_5]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_6]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_7]      = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_TC_ALL]    = {.type = NLA_NESTED},
+	[DCB_PG_ATTR_BW_ID_0]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_1]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_2]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_3]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_4]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_5]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_6]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_7]   = {.type = NLA_U8},
+	[DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG},
+};
+
+/* DCB traffic class nested attributes. */
+static struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = {
+	[DCB_TC_ATTR_PARAM_PGID]            = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_UP_MAPPING]      = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_STRICT_PRIO]     = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_BW_PCT]          = {.type = NLA_U8},
+	[DCB_TC_ATTR_PARAM_ALL]             = {.type = NLA_FLAG},
+};
+
+
+/* standard netlink reply call */
+static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
+                       u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct dcbmsg *dcb;
+	struct nlmsghdr *nlh;
+	int ret = -EINVAL;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		return ret;
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, event, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = cmd;
+	dcb->dcb_pad = 0;
+
+	ret = nla_put_u8(dcbnl_skb, attr, value);
+	if (ret)
+		goto err;
+
+	/* end the message, assign the nlmsg_len. */
+	nlmsg_end(dcbnl_skb, nlh);
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err;
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+	return ret;
+}
+
+static int dcbnl_getstate(struct net_device *netdev, struct nlattr **tb,
+                          u32 pid, u32 seq, u16 flags)
+{
+	int ret = -EINVAL;
+
+	/* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */
+	if (!netdev->dcbnl_ops->getstate)
+		return ret;
+
+	ret = dcbnl_reply(netdev->dcbnl_ops->getstate(netdev), RTM_GETDCB,
+	                  DCB_CMD_GSTATE, DCB_ATTR_STATE, pid, seq, flags);
+
+	return ret;
+}
+
+static int dcbnl_getpfccfg(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret = -EINVAL;
+	int i;
+	int getall = 0;
+
+	if (!tb[DCB_ATTR_PFC_CFG] || !netdev->dcbnl_ops->getpfccfg)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
+	                       tb[DCB_ATTR_PFC_CFG],
+	                       dcbnl_pfc_up_nest);
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_PFC_GCFG;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_PFC_CFG);
+	if (!nest)
+		goto err;
+
+	if (data[DCB_PFC_UP_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0,
+		                             &value);
+		ret = nla_put_u8(dcbnl_skb, i, value);
+
+		if (ret) {
+			nla_nest_cancel(dcbnl_skb, nest);
+			goto err;
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err;
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+err_out:
+	return -EINVAL;
+}
+
+static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlattr **tb,
+                                u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	u8 perm_addr[MAX_ADDR_LEN];
+	int ret = -EINVAL;
+
+	if (!netdev->dcbnl_ops->getpermhwaddr)
+		return ret;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GPERM_HWADDR;
+
+	netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr);
+
+	ret = nla_put(dcbnl_skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr),
+	              perm_addr);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err;
+
+	return 0;
+
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+err_out:
+	return -EINVAL;
+}
+
+static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags, int dir)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *pg_nest, *param_nest, *data;
+	struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+	struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+	u8 prio, pgid, tc_pct, up_map;
+	int ret  = -EINVAL;
+	int getall = 0;
+	int i;
+
+	if (!tb[DCB_ATTR_PG_CFG] ||
+	    !netdev->dcbnl_ops->getpgtccfgtx ||
+	    !netdev->dcbnl_ops->getpgtccfgrx ||
+	    !netdev->dcbnl_ops->getpgbwgcfgtx ||
+	    !netdev->dcbnl_ops->getpgbwgcfgrx)
+		return ret;
+
+	ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX,
+	                       tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest);
+
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = (dir) ? DCB_CMD_PGRX_GCFG : DCB_CMD_PGTX_GCFG;
+
+	pg_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_PG_CFG);
+	if (!pg_nest)
+		goto err;
+
+	if (pg_tb[DCB_PG_ATTR_TC_ALL])
+		getall = 1;
+
+	for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+		if (!getall && !pg_tb[i])
+			continue;
+
+		if (pg_tb[DCB_PG_ATTR_TC_ALL])
+			data = pg_tb[DCB_PG_ATTR_TC_ALL];
+		else
+			data = pg_tb[i];
+		ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX,
+				       data, dcbnl_tc_param_nest);
+		if (ret)
+			goto err_pg;
+
+		param_nest = nla_nest_start(dcbnl_skb, i);
+		if (!param_nest)
+			goto err_pg;
+
+		pgid = DCB_ATTR_VALUE_UNDEFINED;
+		prio = DCB_ATTR_VALUE_UNDEFINED;
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+		up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (dir) {
+			/* Rx */
+			netdev->dcbnl_ops->getpgtccfgrx(netdev,
+						i - DCB_PG_ATTR_TC_0, &prio,
+						&pgid, &tc_pct, &up_map);
+		} else {
+			/* Tx */
+			netdev->dcbnl_ops->getpgtccfgtx(netdev,
+						i - DCB_PG_ATTR_TC_0, &prio,
+						&pgid, &tc_pct, &up_map);
+		}
+
+		if (param_tb[DCB_TC_ATTR_PARAM_PGID] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb,
+			                 DCB_TC_ATTR_PARAM_PGID, pgid);
+			if (ret)
+				goto err_param;
+		}
+		if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb,
+			                 DCB_TC_ATTR_PARAM_UP_MAPPING, up_map);
+			if (ret)
+				goto err_param;
+		}
+		if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb,
+			                 DCB_TC_ATTR_PARAM_STRICT_PRIO, prio);
+			if (ret)
+				goto err_param;
+		}
+		if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] ||
+		    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
+			ret = nla_put_u8(dcbnl_skb, DCB_TC_ATTR_PARAM_BW_PCT,
+			                 tc_pct);
+			if (ret)
+				goto err_param;
+		}
+		nla_nest_end(dcbnl_skb, param_nest);
+	}
+
+	if (pg_tb[DCB_PG_ATTR_BW_ID_ALL])
+		getall = 1;
+	else
+		getall = 0;
+
+	for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+		if (!getall && !pg_tb[i])
+			continue;
+
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (dir) {
+			/* Rx */
+			netdev->dcbnl_ops->getpgbwgcfgrx(netdev,
+					i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+		} else {
+			/* Tx */
+			netdev->dcbnl_ops->getpgbwgcfgtx(netdev,
+					i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
+		}
+		ret = nla_put_u8(dcbnl_skb, i, tc_pct);
+
+		if (ret)
+			goto err_pg;
+	}
+
+	nla_nest_end(dcbnl_skb, pg_nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err_param:
+	nla_nest_cancel(dcbnl_skb, param_nest);
+err_pg:
+	nla_nest_cancel(dcbnl_skb, pg_nest);
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+err_out:
+	ret  = -EINVAL;
+	return ret;
+}
+
+static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags)
+{
+	return __dcbnl_pg_getcfg(netdev, tb, pid, seq, flags, 0);
+}
+
+static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags)
+{
+	return __dcbnl_pg_getcfg(netdev, tb, pid, seq, flags, 1);
+}
+
+static int dcbnl_setstate(struct net_device *netdev, struct nlattr **tb,
+                          u32 pid, u32 seq, u16 flags)
+{
+	int ret = -EINVAL;
+	u8 value;
+
+	if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->setstate)
+		return ret;
+
+	value = nla_get_u8(tb[DCB_ATTR_STATE]);
+
+	netdev->dcbnl_ops->setstate(netdev, value);
+
+	ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_SSTATE, DCB_ATTR_STATE,
+	                  pid, seq, flags);
+
+	return ret;
+}
+
+static int dcbnl_setpfccfg(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1];
+	int i;
+	int ret = -EINVAL;
+	u8 value;
+
+	if (!tb[DCB_ATTR_PFC_CFG] || !netdev->dcbnl_ops->setpfccfg)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
+	                       tb[DCB_ATTR_PFC_CFG],
+	                       dcbnl_pfc_up_nest);
+	if (ret)
+		goto err;
+
+	for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
+		if (data[i] == NULL)
+			continue;
+		value = nla_get_u8(data[i]);
+		netdev->dcbnl_ops->setpfccfg(netdev,
+			data[i]->nla_type - DCB_PFC_UP_ATTR_0, value);
+	}
+
+	ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_PFC_SCFG, DCB_ATTR_PFC_CFG,
+	                  pid, seq, flags);
+err:
+	return ret;
+}
+
+static int dcbnl_setall(struct net_device *netdev, struct nlattr **tb,
+                        u32 pid, u32 seq, u16 flags)
+{
+	int ret = -EINVAL;
+
+	if (!tb[DCB_ATTR_SET_ALL] || !netdev->dcbnl_ops->setall)
+		return ret;
+
+	ret = dcbnl_reply(netdev->dcbnl_ops->setall(netdev), RTM_SETDCB,
+	                  DCB_CMD_SET_ALL, DCB_ATTR_SET_ALL, pid, seq, flags);
+
+	return ret;
+}
+
+static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags, int dir)
+{
+	struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
+	struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
+	int ret = -EINVAL;
+	int i;
+	u8 pgid;
+	u8 up_map;
+	u8 prio;
+	u8 tc_pct;
+
+	if (!tb[DCB_ATTR_PG_CFG] ||
+	    !netdev->dcbnl_ops->setpgtccfgtx ||
+	    !netdev->dcbnl_ops->setpgtccfgrx ||
+	    !netdev->dcbnl_ops->setpgbwgcfgtx ||
+	    !netdev->dcbnl_ops->setpgbwgcfgrx)
+		return ret;
+
+	ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX,
+	                       tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest);
+	if (ret)
+		goto err;
+
+	for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
+		if (!pg_tb[i])
+			continue;
+
+		ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX,
+		                       pg_tb[i], dcbnl_tc_param_nest);
+		if (ret)
+			goto err;
+
+		pgid = DCB_ATTR_VALUE_UNDEFINED;
+		prio = DCB_ATTR_VALUE_UNDEFINED;
+		tc_pct = DCB_ATTR_VALUE_UNDEFINED;
+		up_map = DCB_ATTR_VALUE_UNDEFINED;
+
+		if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO])
+			prio =
+			    nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]);
+
+		if (param_tb[DCB_TC_ATTR_PARAM_PGID])
+			pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]);
+
+		if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT])
+			tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]);
+
+		if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING])
+			up_map =
+			     nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]);
+
+		/* dir: Tx = 0, Rx = 1 */
+		if (dir) {
+			/* Rx */
+			netdev->dcbnl_ops->setpgtccfgrx(netdev,
+				i - DCB_PG_ATTR_TC_0,
+				prio, pgid, tc_pct, up_map);
+		} else {
+			/* Tx */
+			netdev->dcbnl_ops->setpgtccfgtx(netdev,
+				i - DCB_PG_ATTR_TC_0,
+				prio, pgid, tc_pct, up_map);
+		}
+	}
+
+	for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
+		if (!pg_tb[i])
+			continue;
+
+		tc_pct = nla_get_u8(pg_tb[i]);
+
+		/* dir: Tx = 0, Rx = 1 */
+		if (dir) {
+			/* Rx */
+			netdev->dcbnl_ops->setpgbwgcfgrx(netdev,
+					 i - DCB_PG_ATTR_BW_ID_0, tc_pct);
+		} else {
+			/* Tx */
+			netdev->dcbnl_ops->setpgbwgcfgtx(netdev,
+					 i - DCB_PG_ATTR_BW_ID_0, tc_pct);
+		}
+	}
+
+	ret = dcbnl_reply(0, RTM_SETDCB,
+			  (dir ? DCB_CMD_PGRX_SCFG : DCB_CMD_PGTX_SCFG),
+			  DCB_ATTR_PG_CFG, pid, seq, flags);
+
+err:
+	return ret;
+}
+
+static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags)
+{
+	return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, 0);
+}
+
+static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags)
+{
+	return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, 1);
+}
+
+static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *netdev;
+	struct dcbmsg  *dcb = (struct dcbmsg *)NLMSG_DATA(nlh);
+	struct nlattr *tb[DCB_ATTR_MAX + 1];
+	u32 pid = skb ? NETLINK_CB(skb).pid : 0;
+	int ret = -EINVAL;
+
+	if (net != &init_net)
+		return -EINVAL;
+
+	ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
+			  dcbnl_rtnl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[DCB_ATTR_IFNAME])
+		return -EINVAL;
+
+	netdev = dev_get_by_name(&init_net, nla_data(tb[DCB_ATTR_IFNAME]));
+	if (!netdev)
+		return -EINVAL;
+
+	if (!netdev->dcbnl_ops)
+		goto errout;
+
+	switch (dcb->cmd) {
+	case DCB_CMD_GSTATE:
+		ret = dcbnl_getstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                     nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_GCFG:
+		ret = dcbnl_getpfccfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_GPERM_HWADDR:
+		ret = dcbnl_getperm_hwaddr(netdev, tb, pid, nlh->nlmsg_seq,
+		                           nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGTX_GCFG:
+		ret = dcbnl_pgtx_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGRX_GCFG:
+		ret = dcbnl_pgrx_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SSTATE:
+		ret = dcbnl_setstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                     nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_SCFG:
+		ret = dcbnl_setpfccfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+
+	case DCB_CMD_SET_ALL:
+		ret = dcbnl_setall(netdev, tb, pid, nlh->nlmsg_seq,
+		                   nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGTX_SCFG:
+		ret = dcbnl_pgtx_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PGRX_SCFG:
+		ret = dcbnl_pgrx_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	default:
+		goto errout;
+	}
+errout:
+	ret = -EINVAL;
+out:
+	dev_put(netdev);
+	return ret;
+}
+
+static int __init dcbnl_init(void)
+{
+	rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL);
+	rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL);
+
+	return 0;
+}
+module_init(dcbnl_init);
+
+static void __exit dcbnl_exit(void)
+{
+	rtnl_unregister(PF_UNSPEC, RTM_GETDCB);
+	rtnl_unregister(PF_UNSPEC, RTM_SETDCB);
+}
+module_exit(dcbnl_exit);
+
+
-- 
cgit v1.2.3-70-g09d2


From 46132188bf72e22ef097f16ed5c969ee8cea1e8b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 20 Nov 2008 21:05:08 -0800
Subject: DCB: Add interface to query for the DCB capabilities of an device.

Adds to the netlink interface for Data Center Bridging (DCB), allowing
the DCB capabilities supported by a device to be queried.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ixgbe/ixgbe_dcb_nl.c | 42 +++++++++++++++++++-
 include/linux/dcbnl.h            | 37 ++++++++++++++++++
 include/net/dcbnl.h              |  1 +
 net/dcb/dcbnl.c                  | 82 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 161 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index 50bff2af6b0..eb3a6cea1aa 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -337,6 +337,45 @@ static u8 ixgbe_dcbnl_set_all(struct net_device *netdev)
 	return ret;
 }
 
+static u8 ixgbe_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	u8 rval = 0;
+
+	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+		switch (capid) {
+		case DCB_CAP_ATTR_PG:
+			*cap = true;
+			break;
+		case DCB_CAP_ATTR_PFC:
+			*cap = true;
+			break;
+		case DCB_CAP_ATTR_UP2TC:
+			*cap = false;
+			break;
+		case DCB_CAP_ATTR_PG_TCS:
+			*cap = 0x80;
+			break;
+		case DCB_CAP_ATTR_PFC_TCS:
+			*cap = 0x80;
+			break;
+		case DCB_CAP_ATTR_GSP:
+			*cap = true;
+			break;
+		case DCB_CAP_ATTR_BCN:
+			*cap = false;
+			break;
+		default:
+			rval = -EINVAL;
+			break;
+		}
+	} else {
+		rval = -EINVAL;
+	}
+
+	return rval;
+}
+
 struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getstate	= ixgbe_dcbnl_get_state,
 	.setstate	= ixgbe_dcbnl_set_state,
@@ -351,6 +390,7 @@ struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getpgbwgcfgrx	= ixgbe_dcbnl_get_pg_bwg_cfg_rx,
 	.setpfccfg	= ixgbe_dcbnl_set_pfc_cfg,
 	.getpfccfg	= ixgbe_dcbnl_get_pfc_cfg,
-	.setall		= ixgbe_dcbnl_set_all
+	.setall		= ixgbe_dcbnl_set_all,
+	.getcap		= ixgbe_dcbnl_getcap
 };
 
diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index 32d32c1ee41..13f0c638a69 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -43,6 +43,7 @@ struct dcbmsg {
  * @DCB_CMD_SET_ALL: apply all changes to the underlying device
  * @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying
  *                        device.  Only useful when using bonding.
+ * @DCB_CMD_GCAP: request the DCB capabilities of the device
  */
 enum dcbnl_commands {
 	DCB_CMD_UNDEFINED,
@@ -60,6 +61,7 @@ enum dcbnl_commands {
 
 	DCB_CMD_SET_ALL,
 	DCB_CMD_GPERM_HWADDR,
+	DCB_CMD_GCAP,
 
 	__DCB_CMD_ENUM_MAX,
 	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
@@ -78,6 +80,7 @@ enum dcbnl_commands {
  * @DCB_ATTR_PG_CFG: priority group configuration (NLA_NESTED)
  * @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8)
  * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED)
+ * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED)
  */
 enum dcbnl_attrs {
 	DCB_ATTR_UNDEFINED,
@@ -90,6 +93,7 @@ enum dcbnl_attrs {
 	DCB_ATTR_PG_CFG,
 	DCB_ATTR_SET_ALL,
 	DCB_ATTR_PERM_HWADDR,
+	DCB_ATTR_CAP,
 
 	__DCB_ATTR_ENUM_MAX,
 	DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1,
@@ -216,6 +220,39 @@ enum dcbnl_tc_attrs {
 	DCB_TC_ATTR_PARAM_MAX = __DCB_TC_ATTR_PARAM_ENUM_MAX - 1,
 };
 
+/**
+ * enum dcbnl_cap_attrs - DCB Capability attributes
+ *
+ * @DCB_CAP_ATTR_UNDEFINED: unspecified attribute to catch errors
+ * @DCB_CAP_ATTR_ALL: (NLA_FLAG) all capability parameters
+ * @DCB_CAP_ATTR_PG: (NLA_U8) device supports Priority Groups
+ * @DCB_CAP_ATTR_PFC: (NLA_U8) device supports Priority Flow Control
+ * @DCB_CAP_ATTR_UP2TC: (NLA_U8) device supports user priority to
+ *                               traffic class mapping
+ * @DCB_CAP_ATTR_PG_TCS: (NLA_U8) bitmap where each bit represents a
+ *                                number of traffic classes the device
+ *                                can be configured to use for Priority Groups
+ * @DCB_CAP_ATTR_PFC_TCS: (NLA_U8) bitmap where each bit represents a
+ *                                 number of traffic classes the device can be
+ *                                 configured to use for Priority Flow Control
+ * @DCB_CAP_ATTR_GSP: (NLA_U8) device supports group strict priority
+ * @DCB_CAP_ATTR_BCN: (NLA_U8) device supports Backwards Congestion
+ *                             Notification
+ */
+enum dcbnl_cap_attrs {
+	DCB_CAP_ATTR_UNDEFINED,
+	DCB_CAP_ATTR_ALL,
+	DCB_CAP_ATTR_PG,
+	DCB_CAP_ATTR_PFC,
+	DCB_CAP_ATTR_UP2TC,
+	DCB_CAP_ATTR_PG_TCS,
+	DCB_CAP_ATTR_PFC_TCS,
+	DCB_CAP_ATTR_GSP,
+	DCB_CAP_ATTR_BCN,
+
+	__DCB_CAP_ATTR_ENUM_MAX,
+	DCB_CAP_ATTR_MAX = __DCB_CAP_ATTR_ENUM_MAX - 1,
+};
 /**
  * enum dcb_general_attr_values - general DCB attribute values
  *
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index 0ef0c5a46d8..183ed040cf4 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -39,6 +39,7 @@ struct dcbnl_rtnl_ops {
 	void (*setpfccfg)(struct net_device *, int, u8);
 	void (*getpfccfg)(struct net_device *, int, u8 *);
 	u8   (*setall)(struct net_device *);
+	u8   (*getcap)(struct net_device *, int, u8 *);
 };
 
 #endif /* __NET_DCBNL_H__ */
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 516e8be83d7..de61d64d102 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -61,6 +61,7 @@ static struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
 	[DCB_ATTR_PG_CFG]    = {.type = NLA_NESTED},
 	[DCB_ATTR_SET_ALL]   = {.type = NLA_U8},
 	[DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
+	[DCB_ATTR_CAP]       = {.type = NLA_NESTED},
 };
 
 /* DCB priority flow control to User Priority nested attributes */
@@ -107,6 +108,17 @@ static struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = {
 	[DCB_TC_ATTR_PARAM_ALL]             = {.type = NLA_FLAG},
 };
 
+/* DCB capabilities nested attributes. */
+static struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = {
+	[DCB_CAP_ATTR_ALL]     = {.type = NLA_FLAG},
+	[DCB_CAP_ATTR_PG]      = {.type = NLA_U8},
+	[DCB_CAP_ATTR_PFC]     = {.type = NLA_U8},
+	[DCB_CAP_ATTR_UP2TC]   = {.type = NLA_U8},
+	[DCB_CAP_ATTR_PG_TCS]  = {.type = NLA_U8},
+	[DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8},
+	[DCB_CAP_ATTR_GSP]     = {.type = NLA_U8},
+	[DCB_CAP_ATTR_BCN]     = {.type = NLA_U8},
+};
 
 /* standard netlink reply call */
 static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
@@ -269,6 +281,72 @@ err_out:
 	return -EINVAL;
 }
 
+static int dcbnl_getcap(struct net_device *netdev, struct nlattr **tb,
+                        u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret = -EINVAL;
+	int i;
+	int getall = 0;
+
+	if (!tb[DCB_ATTR_CAP] || !netdev->dcbnl_ops->getcap)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP],
+	                       dcbnl_cap_nest);
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GCAP;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_CAP);
+	if (!nest)
+		goto err;
+
+	if (data[DCB_CAP_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) {
+			ret = nla_put_u8(dcbnl_skb, i, value);
+
+			if (ret) {
+				nla_nest_cancel(dcbnl_skb, nest);
+				goto err;
+			}
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err;
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+err_out:
+	return -EINVAL;
+}
+
 static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlattr **tb,
                              u32 pid, u32 seq, u16 flags, int dir)
 {
@@ -675,6 +753,10 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		ret = dcbnl_pgrx_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
 		                        nlh->nlmsg_flags);
 		goto out;
+	case DCB_CMD_GCAP:
+		ret = dcbnl_getcap(netdev, tb, pid, nlh->nlmsg_seq,
+		                   nlh->nlmsg_flags);
+		goto out;
 	default:
 		goto errout;
 	}
-- 
cgit v1.2.3-70-g09d2


From 33dbabc4a7f7bd72313c73a3c199f31f3900336f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 20 Nov 2008 21:08:19 -0800
Subject: DCB: Add interface to query # of TCs supported by device

Adds interface for Data Center Bridging (DCB) to query (and set if
supported) the number of traffic classes currently supported by the
device for the two (DCB) features: priority groups (PG) and priority
flow control (PFC).

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ixgbe/ixgbe_dcb_nl.c |  33 +++++++++-
 include/linux/dcbnl.h            |  27 ++++++++
 include/net/dcbnl.h              |   2 +
 net/dcb/dcbnl.c                  | 132 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index eb3a6cea1aa..5921795f840 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -376,6 +376,35 @@ static u8 ixgbe_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap)
 	return rval;
 }
 
+static u8 ixgbe_dcbnl_getnumtcs(struct net_device *netdev, int tcid, u8 *num)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	u8 rval = 0;
+
+	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
+		switch (tcid) {
+		case DCB_NUMTCS_ATTR_PG:
+			*num = MAX_TRAFFIC_CLASS;
+			break;
+		case DCB_NUMTCS_ATTR_PFC:
+			*num = MAX_TRAFFIC_CLASS;
+			break;
+		default:
+			rval = -EINVAL;
+			break;
+		}
+	} else {
+		rval = -EINVAL;
+	}
+
+	return rval;
+}
+
+static u8 ixgbe_dcbnl_setnumtcs(struct net_device *netdev, int tcid, u8 num)
+{
+	return -EINVAL;
+}
+
 struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getstate	= ixgbe_dcbnl_get_state,
 	.setstate	= ixgbe_dcbnl_set_state,
@@ -391,6 +420,8 @@ struct dcbnl_rtnl_ops dcbnl_ops = {
 	.setpfccfg	= ixgbe_dcbnl_set_pfc_cfg,
 	.getpfccfg	= ixgbe_dcbnl_get_pfc_cfg,
 	.setall		= ixgbe_dcbnl_set_all,
-	.getcap		= ixgbe_dcbnl_getcap
+	.getcap		= ixgbe_dcbnl_getcap,
+	.getnumtcs	= ixgbe_dcbnl_getnumtcs,
+	.setnumtcs	= ixgbe_dcbnl_setnumtcs
 };
 
diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index 13f0c638a69..1077fba1dad 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -44,6 +44,8 @@ struct dcbmsg {
  * @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying
  *                        device.  Only useful when using bonding.
  * @DCB_CMD_GCAP: request the DCB capabilities of the device
+ * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported
+ * @DCB_CMD_SNUMTCS: set the number of traffic classes
  */
 enum dcbnl_commands {
 	DCB_CMD_UNDEFINED,
@@ -62,6 +64,8 @@ enum dcbnl_commands {
 	DCB_CMD_SET_ALL,
 	DCB_CMD_GPERM_HWADDR,
 	DCB_CMD_GCAP,
+	DCB_CMD_GNUMTCS,
+	DCB_CMD_SNUMTCS,
 
 	__DCB_CMD_ENUM_MAX,
 	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
@@ -81,6 +85,7 @@ enum dcbnl_commands {
  * @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8)
  * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED)
  * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED)
+ * @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED)
  */
 enum dcbnl_attrs {
 	DCB_ATTR_UNDEFINED,
@@ -94,6 +99,7 @@ enum dcbnl_attrs {
 	DCB_ATTR_SET_ALL,
 	DCB_ATTR_PERM_HWADDR,
 	DCB_ATTR_CAP,
+	DCB_ATTR_NUMTCS,
 
 	__DCB_ATTR_ENUM_MAX,
 	DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1,
@@ -253,6 +259,27 @@ enum dcbnl_cap_attrs {
 	__DCB_CAP_ATTR_ENUM_MAX,
 	DCB_CAP_ATTR_MAX = __DCB_CAP_ATTR_ENUM_MAX - 1,
 };
+
+/**
+ * enum dcbnl_numtcs_attrs - number of traffic classes
+ *
+ * @DCB_NUMTCS_ATTR_UNDEFINED: unspecified attribute to catch errors
+ * @DCB_NUMTCS_ATTR_ALL: (NLA_FLAG) all traffic class attributes
+ * @DCB_NUMTCS_ATTR_PG: (NLA_U8) number of traffic classes used for
+ *                               priority groups
+ * @DCB_NUMTCS_ATTR_PFC: (NLA_U8) number of traffic classes which can
+ *                                support priority flow control
+ */
+enum dcbnl_numtcs_attrs {
+	DCB_NUMTCS_ATTR_UNDEFINED,
+	DCB_NUMTCS_ATTR_ALL,
+	DCB_NUMTCS_ATTR_PG,
+	DCB_NUMTCS_ATTR_PFC,
+
+	__DCB_NUMTCS_ATTR_ENUM_MAX,
+	DCB_NUMTCS_ATTR_MAX = __DCB_NUMTCS_ATTR_ENUM_MAX - 1,
+};
+
 /**
  * enum dcb_general_attr_values - general DCB attribute values
  *
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index 183ed040cf4..f0a65281ea7 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -40,6 +40,8 @@ struct dcbnl_rtnl_ops {
 	void (*getpfccfg)(struct net_device *, int, u8 *);
 	u8   (*setall)(struct net_device *);
 	u8   (*getcap)(struct net_device *, int, u8 *);
+	u8   (*getnumtcs)(struct net_device *, int, u8 *);
+	u8   (*setnumtcs)(struct net_device *, int, u8);
 };
 
 #endif /* __NET_DCBNL_H__ */
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index de61d64d102..5ff7e3c0c17 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -120,6 +120,13 @@ static struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = {
 	[DCB_CAP_ATTR_BCN]     = {.type = NLA_U8},
 };
 
+/* DCB capabilities nested attributes. */
+static struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = {
+	[DCB_NUMTCS_ATTR_ALL]     = {.type = NLA_FLAG},
+	[DCB_NUMTCS_ATTR_PG]      = {.type = NLA_U8},
+	[DCB_NUMTCS_ATTR_PFC]     = {.type = NLA_U8},
+};
+
 /* standard netlink reply call */
 static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
                        u32 seq, u16 flags)
@@ -347,6 +354,123 @@ err_out:
 	return -EINVAL;
 }
 
+static int dcbnl_getnumtcs(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest;
+	u8 value;
+	int ret = -EINVAL;
+	int i;
+	int getall = 0;
+
+	if (!tb[DCB_ATTR_NUMTCS] || !netdev->dcbnl_ops->getnumtcs)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
+	                       dcbnl_numtcs_nest);
+	if (ret) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_GNUMTCS;
+
+	nest = nla_nest_start(dcbnl_skb, DCB_ATTR_NUMTCS);
+	if (!nest) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (data[DCB_NUMTCS_ATTR_ALL])
+		getall = 1;
+
+	for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+		if (!getall && !data[i])
+			continue;
+
+		ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value);
+		if (!ret) {
+			ret = nla_put_u8(dcbnl_skb, i, value);
+
+			if (ret) {
+				nla_nest_cancel(dcbnl_skb, nest);
+				ret = -EINVAL;
+				goto err;
+			}
+		} else {
+			goto err;
+		}
+	}
+	nla_nest_end(dcbnl_skb, nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+err_out:
+	return ret;
+}
+
+static int dcbnl_setnumtcs(struct net_device *netdev, struct nlattr **tb,
+                           u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1];
+	int ret = -EINVAL;
+	u8 value;
+	int i;
+
+	if (!tb[DCB_ATTR_NUMTCS] || !netdev->dcbnl_ops->setstate)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
+	                       dcbnl_numtcs_nest);
+
+	if (ret) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
+		if (data[i] == NULL)
+			continue;
+
+		value = nla_get_u8(data[i]);
+
+		ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value);
+
+		if (ret)
+			goto operr;
+	}
+
+operr:
+	ret = dcbnl_reply(!!ret, RTM_SETDCB, DCB_CMD_SNUMTCS,
+	                  DCB_ATTR_NUMTCS, pid, seq, flags);
+
+err:
+	return ret;
+}
+
 static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlattr **tb,
                              u32 pid, u32 seq, u16 flags, int dir)
 {
@@ -757,6 +881,14 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		ret = dcbnl_getcap(netdev, tb, pid, nlh->nlmsg_seq,
 		                   nlh->nlmsg_flags);
 		goto out;
+	case DCB_CMD_GNUMTCS:
+		ret = dcbnl_getnumtcs(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_SNUMTCS:
+		ret = dcbnl_setnumtcs(netdev, tb, pid, nlh->nlmsg_seq,
+		                      nlh->nlmsg_flags);
+		goto out;
 	default:
 		goto errout;
 	}
-- 
cgit v1.2.3-70-g09d2


From 0eb3aa9bab20217fb42244ccdcb5bf8a002f504c Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 20 Nov 2008 21:09:23 -0800
Subject: DCB: Add interface to query the state of PFC feature.

Adds a netlink interface for Data Center Bridging (DCB) to get and set
the enable state of the Priority Flow Control (PFC) feature.
Primarily, this is a way to turn off PFC in the driver while DCB
remains enabled.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ixgbe/ixgbe_dcb_nl.c | 16 ++++++++++++++-
 include/linux/dcbnl.h            |  2 ++
 include/net/dcbnl.h              |  2 ++
 net/dcb/dcbnl.c                  | 43 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index 5921795f840..dd940a8f935 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -405,6 +405,18 @@ static u8 ixgbe_dcbnl_setnumtcs(struct net_device *netdev, int tcid, u8 num)
 	return -EINVAL;
 }
 
+static u8 ixgbe_dcbnl_getpfcstate(struct net_device *netdev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	return !!(adapter->flags & IXGBE_FLAG_DCB_ENABLED);
+}
+
+static void ixgbe_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
+{
+	return;
+}
+
 struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getstate	= ixgbe_dcbnl_get_state,
 	.setstate	= ixgbe_dcbnl_set_state,
@@ -422,6 +434,8 @@ struct dcbnl_rtnl_ops dcbnl_ops = {
 	.setall		= ixgbe_dcbnl_set_all,
 	.getcap		= ixgbe_dcbnl_getcap,
 	.getnumtcs	= ixgbe_dcbnl_getnumtcs,
-	.setnumtcs	= ixgbe_dcbnl_setnumtcs
+	.setnumtcs	= ixgbe_dcbnl_setnumtcs,
+	.getpfcstate	= ixgbe_dcbnl_getpfcstate,
+	.setpfcstate	= ixgbe_dcbnl_setpfcstate
 };
 
diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index 1077fba1dad..6cc4560bc37 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -66,6 +66,8 @@ enum dcbnl_commands {
 	DCB_CMD_GCAP,
 	DCB_CMD_GNUMTCS,
 	DCB_CMD_SNUMTCS,
+	DCB_CMD_PFC_GSTATE,
+	DCB_CMD_PFC_SSTATE,
 
 	__DCB_CMD_ENUM_MAX,
 	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index f0a65281ea7..c7d87caf3f9 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -42,6 +42,8 @@ struct dcbnl_rtnl_ops {
 	u8   (*getcap)(struct net_device *, int, u8 *);
 	u8   (*getnumtcs)(struct net_device *, int, u8 *);
 	u8   (*setnumtcs)(struct net_device *, int, u8);
+	u8   (*getpfcstate)(struct net_device *);
+	void (*setpfcstate)(struct net_device *, u8);
 };
 
 #endif /* __NET_DCBNL_H__ */
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 5ff7e3c0c17..758419c6f59 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -62,6 +62,7 @@ static struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
 	[DCB_ATTR_SET_ALL]   = {.type = NLA_U8},
 	[DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
 	[DCB_ATTR_CAP]       = {.type = NLA_NESTED},
+	[DCB_ATTR_PFC_STATE] = {.type = NLA_U8},
 };
 
 /* DCB priority flow control to User Priority nested attributes */
@@ -471,6 +472,40 @@ err:
 	return ret;
 }
 
+static int dcbnl_getpfcstate(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags)
+{
+	int ret = -EINVAL;
+
+	if (!netdev->dcbnl_ops->getpfcstate)
+		return ret;
+
+	ret = dcbnl_reply(netdev->dcbnl_ops->getpfcstate(netdev), RTM_GETDCB,
+	                  DCB_CMD_PFC_GSTATE, DCB_ATTR_PFC_STATE,
+	                  pid, seq, flags);
+
+	return ret;
+}
+
+static int dcbnl_setpfcstate(struct net_device *netdev, struct nlattr **tb,
+                             u32 pid, u32 seq, u16 flags)
+{
+	int ret = -EINVAL;
+	u8 value;
+
+	if (!tb[DCB_ATTR_PFC_STATE] || !netdev->dcbnl_ops->setpfcstate)
+		return ret;
+
+	value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]);
+
+	netdev->dcbnl_ops->setpfcstate(netdev, value);
+
+	ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_PFC_SSTATE, DCB_ATTR_PFC_STATE,
+	                  pid, seq, flags);
+
+	return ret;
+}
+
 static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlattr **tb,
                              u32 pid, u32 seq, u16 flags, int dir)
 {
@@ -889,6 +924,14 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		ret = dcbnl_setnumtcs(netdev, tb, pid, nlh->nlmsg_seq,
 		                      nlh->nlmsg_flags);
 		goto out;
+	case DCB_CMD_PFC_GSTATE:
+		ret = dcbnl_getpfcstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_PFC_SSTATE:
+		ret = dcbnl_setpfcstate(netdev, tb, pid, nlh->nlmsg_seq,
+		                        nlh->nlmsg_flags);
+		goto out;
 	default:
 		goto errout;
 	}
-- 
cgit v1.2.3-70-g09d2


From 859ee3c43812051e21816c6d6d4cc04fb7ce9b2e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 20 Nov 2008 21:10:23 -0800
Subject: DCB: Add support for DCB BCN

Adds an interface to configure the Backward Congestion Notification
(BCN) feature.  In a BCN capabale network, congestion notifications
from congested points out in the network can cause the end station
limit the rate of a given traffic flow.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ixgbe/ixgbe_dcb.h    |  27 ++++++
 drivers/net/ixgbe/ixgbe_dcb_nl.c | 176 ++++++++++++++++++++++++++++++++++++++-
 include/linux/dcbnl.h            |  44 +++++++++-
 include/net/dcbnl.h              |   4 +
 net/dcb/dcbnl.c                  | 174 ++++++++++++++++++++++++++++++++++++--
 5 files changed, 416 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ixgbe/ixgbe_dcb.h b/drivers/net/ixgbe/ixgbe_dcb.h
index 62dfd243bed..75f6efe1e36 100644
--- a/drivers/net/ixgbe/ixgbe_dcb.h
+++ b/drivers/net/ixgbe/ixgbe_dcb.h
@@ -108,7 +108,34 @@ enum dcb_rx_pba_cfg {
 	pba_80_48      /* PBA[0-3] each use 80KB, PBA[4-7] each use 48KB */
 };
 
+/*
+ * This structure contains many values encoded as fixed-point
+ * numbers, meaning that some of bits are dedicated to the
+ * magnitude and others to the fraction part. In the comments
+ * this is shown as f=n, where n is the number of fraction bits.
+ * These fraction bits are always the low-order bits. The size
+ * of the magnitude is not specified.
+ */
+struct bcn_config {
+	u32 rp_admin_mode[MAX_TRAFFIC_CLASS]; /* BCN enabled, per TC */
+	u32 bcna_option[2]; /* BCNA Port + MAC Addr */
+	u32 rp_w;        /* Derivative Weight, f=3 */
+	u32 rp_gi;       /* Increase Gain, f=12 */
+	u32 rp_gd;       /* Decrease Gain, f=12 */
+	u32 rp_ru;       /* Rate Unit */
+	u32 rp_alpha;    /* Max Decrease Factor, f=12 */
+	u32 rp_beta;     /* Max Increase Factor, f=12 */
+	u32 rp_ri;       /* Initial Rate */
+	u32 rp_td;       /* Drift Interval Timer */
+	u32 rp_rd;       /* Drift Increase */
+	u32 rp_tmax;     /* Severe Congestion Backoff Timer Range */
+	u32 rp_rmin;     /* Severe Congestion Restart Rate */
+	u32 rp_wrtt;     /* RTT Moving Average Weight */
+};
+
 struct ixgbe_dcb_config {
+	struct bcn_config bcn;
+
 	struct tc_configuration tc_config[MAX_TRAFFIC_CLASS];
 	u8     bw_percentage[2][MAX_BW_GROUP]; /* One each for Tx/Rx */
 
diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index dd940a8f935..615c2803202 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -34,6 +34,7 @@
 #define BIT_PFC		0x02
 #define BIT_PG_RX	0x04
 #define BIT_PG_TX	0x08
+#define BIT_BCN         0x10
 
 int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
                        struct ixgbe_dcb_config *dst_dcb_cfg, int tc_max)
@@ -88,6 +89,23 @@ int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
 			src_dcb_cfg->tc_config[i - DCB_PFC_UP_ATTR_0].dcb_pfc;
 	}
 
+	for (i = DCB_BCN_ATTR_RP_0; i < DCB_BCN_ATTR_RP_ALL; i++) {
+		dst_dcb_cfg->bcn.rp_admin_mode[i - DCB_BCN_ATTR_RP_0] =
+			src_dcb_cfg->bcn.rp_admin_mode[i - DCB_BCN_ATTR_RP_0];
+	}
+	dst_dcb_cfg->bcn.rp_alpha = src_dcb_cfg->bcn.rp_alpha;
+	dst_dcb_cfg->bcn.rp_beta = src_dcb_cfg->bcn.rp_beta;
+	dst_dcb_cfg->bcn.rp_gd = src_dcb_cfg->bcn.rp_gd;
+	dst_dcb_cfg->bcn.rp_gi = src_dcb_cfg->bcn.rp_gi;
+	dst_dcb_cfg->bcn.rp_tmax = src_dcb_cfg->bcn.rp_tmax;
+	dst_dcb_cfg->bcn.rp_td = src_dcb_cfg->bcn.rp_td;
+	dst_dcb_cfg->bcn.rp_rmin = src_dcb_cfg->bcn.rp_rmin;
+	dst_dcb_cfg->bcn.rp_w = src_dcb_cfg->bcn.rp_w;
+	dst_dcb_cfg->bcn.rp_rd = src_dcb_cfg->bcn.rp_rd;
+	dst_dcb_cfg->bcn.rp_ru = src_dcb_cfg->bcn.rp_ru;
+	dst_dcb_cfg->bcn.rp_wrtt = src_dcb_cfg->bcn.rp_wrtt;
+	dst_dcb_cfg->bcn.rp_ri = src_dcb_cfg->bcn.rp_ri;
+
 	return 0;
 }
 
@@ -313,6 +331,7 @@ static u8 ixgbe_dcbnl_set_all(struct net_device *netdev)
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	int ret;
 
+	adapter->dcb_set_bitmap &= ~BIT_BCN;	/* no set for BCN */
 	if (!adapter->dcb_set_bitmap)
 		return 1;
 
@@ -417,6 +436,157 @@ static void ixgbe_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
 	return;
 }
 
+static void ixgbe_dcbnl_getbcnrp(struct net_device *netdev, int priority,
+				  u8 *setting)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	*setting = adapter->dcb_cfg.bcn.rp_admin_mode[priority];
+}
+
+
+static void ixgbe_dcbnl_getbcncfg(struct net_device *netdev, int enum_index,
+				  u32 *setting)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	switch (enum_index) {
+	case DCB_BCN_ATTR_ALPHA:
+		*setting = adapter->dcb_cfg.bcn.rp_alpha;
+		break;
+	case DCB_BCN_ATTR_BETA:
+		*setting = adapter->dcb_cfg.bcn.rp_beta;
+		break;
+	case DCB_BCN_ATTR_GD:
+		*setting = adapter->dcb_cfg.bcn.rp_gd;
+		break;
+	case DCB_BCN_ATTR_GI:
+		*setting = adapter->dcb_cfg.bcn.rp_gi;
+		break;
+	case DCB_BCN_ATTR_TMAX:
+		*setting = adapter->dcb_cfg.bcn.rp_tmax;
+		break;
+	case DCB_BCN_ATTR_TD:
+		*setting = adapter->dcb_cfg.bcn.rp_td;
+		break;
+	case DCB_BCN_ATTR_RMIN:
+		*setting = adapter->dcb_cfg.bcn.rp_rmin;
+		break;
+	case DCB_BCN_ATTR_W:
+		*setting = adapter->dcb_cfg.bcn.rp_w;
+		break;
+	case DCB_BCN_ATTR_RD:
+		*setting = adapter->dcb_cfg.bcn.rp_rd;
+		break;
+	case DCB_BCN_ATTR_RU:
+		*setting = adapter->dcb_cfg.bcn.rp_ru;
+		break;
+	case DCB_BCN_ATTR_WRTT:
+		*setting = adapter->dcb_cfg.bcn.rp_wrtt;
+		break;
+	case DCB_BCN_ATTR_RI:
+		*setting = adapter->dcb_cfg.bcn.rp_ri;
+		break;
+	default:
+		*setting = -1;
+	}
+}
+
+static void ixgbe_dcbnl_setbcnrp(struct net_device *netdev, int priority,
+				 u8 setting)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	adapter->temp_dcb_cfg.bcn.rp_admin_mode[priority] = setting;
+
+	if (adapter->temp_dcb_cfg.bcn.rp_admin_mode[priority] !=
+	    adapter->dcb_cfg.bcn.rp_admin_mode[priority])
+		adapter->dcb_set_bitmap |= BIT_BCN;
+}
+
+static void ixgbe_dcbnl_setbcncfg(struct net_device *netdev, int enum_index,
+				 u32 setting)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+	switch (enum_index) {
+	case DCB_BCN_ATTR_ALPHA:
+		adapter->temp_dcb_cfg.bcn.rp_alpha = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_alpha !=
+		    adapter->dcb_cfg.bcn.rp_alpha)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_BETA:
+		adapter->temp_dcb_cfg.bcn.rp_beta = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_beta !=
+		    adapter->dcb_cfg.bcn.rp_beta)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_GD:
+		adapter->temp_dcb_cfg.bcn.rp_gd = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_gd !=
+		    adapter->dcb_cfg.bcn.rp_gd)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_GI:
+		adapter->temp_dcb_cfg.bcn.rp_gi = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_gi !=
+		    adapter->dcb_cfg.bcn.rp_gi)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_TMAX:
+		adapter->temp_dcb_cfg.bcn.rp_tmax = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_tmax !=
+		    adapter->dcb_cfg.bcn.rp_tmax)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_TD:
+		adapter->temp_dcb_cfg.bcn.rp_td = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_td !=
+		    adapter->dcb_cfg.bcn.rp_td)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_RMIN:
+		adapter->temp_dcb_cfg.bcn.rp_rmin = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_rmin !=
+		    adapter->dcb_cfg.bcn.rp_rmin)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_W:
+		adapter->temp_dcb_cfg.bcn.rp_w = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_w !=
+		    adapter->dcb_cfg.bcn.rp_w)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_RD:
+		adapter->temp_dcb_cfg.bcn.rp_rd = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_rd !=
+		    adapter->dcb_cfg.bcn.rp_rd)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_RU:
+		adapter->temp_dcb_cfg.bcn.rp_ru = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_ru !=
+		    adapter->dcb_cfg.bcn.rp_ru)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_WRTT:
+		adapter->temp_dcb_cfg.bcn.rp_wrtt = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_wrtt !=
+		    adapter->dcb_cfg.bcn.rp_wrtt)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_RI:
+		adapter->temp_dcb_cfg.bcn.rp_ri = setting;
+		if (adapter->temp_dcb_cfg.bcn.rp_ri !=
+		    adapter->dcb_cfg.bcn.rp_ri)
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	default:
+		break;
+	}
+}
+
 struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getstate	= ixgbe_dcbnl_get_state,
 	.setstate	= ixgbe_dcbnl_set_state,
@@ -436,6 +606,10 @@ struct dcbnl_rtnl_ops dcbnl_ops = {
 	.getnumtcs	= ixgbe_dcbnl_getnumtcs,
 	.setnumtcs	= ixgbe_dcbnl_setnumtcs,
 	.getpfcstate	= ixgbe_dcbnl_getpfcstate,
-	.setpfcstate	= ixgbe_dcbnl_setpfcstate
+	.setpfcstate	= ixgbe_dcbnl_setpfcstate,
+	.getbcncfg      = ixgbe_dcbnl_getbcncfg,
+	.getbcnrp       = ixgbe_dcbnl_getbcnrp,
+	.setbcncfg      = ixgbe_dcbnl_setbcncfg,
+	.setbcnrp       = ixgbe_dcbnl_setbcnrp
 };
 
diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index 6cc4560bc37..e73a61449ad 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -46,6 +46,8 @@ struct dcbmsg {
  * @DCB_CMD_GCAP: request the DCB capabilities of the device
  * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported
  * @DCB_CMD_SNUMTCS: set the number of traffic classes
+ * @DCB_CMD_GBCN: set backward congestion notification configuration
+ * @DCB_CMD_SBCN: get backward congestion notification configration.
  */
 enum dcbnl_commands {
 	DCB_CMD_UNDEFINED,
@@ -62,18 +64,24 @@ enum dcbnl_commands {
 	DCB_CMD_PFC_SCFG,
 
 	DCB_CMD_SET_ALL,
+
 	DCB_CMD_GPERM_HWADDR,
+
 	DCB_CMD_GCAP,
+
 	DCB_CMD_GNUMTCS,
 	DCB_CMD_SNUMTCS,
+
 	DCB_CMD_PFC_GSTATE,
 	DCB_CMD_PFC_SSTATE,
 
+	DCB_CMD_BCN_GCFG,
+	DCB_CMD_BCN_SCFG,
+
 	__DCB_CMD_ENUM_MAX,
 	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
 };
 
-
 /**
  * enum dcbnl_attrs - DCB top-level netlink attributes
  *
@@ -88,6 +96,7 @@ enum dcbnl_commands {
  * @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED)
  * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED)
  * @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED)
+ * @DCB_ATTR_BCN: backward congestion notification configuration (NLA_NESTED)
  */
 enum dcbnl_attrs {
 	DCB_ATTR_UNDEFINED,
@@ -102,6 +111,7 @@ enum dcbnl_attrs {
 	DCB_ATTR_PERM_HWADDR,
 	DCB_ATTR_CAP,
 	DCB_ATTR_NUMTCS,
+	DCB_ATTR_BCN,
 
 	__DCB_ATTR_ENUM_MAX,
 	DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1,
@@ -282,6 +292,38 @@ enum dcbnl_numtcs_attrs {
 	DCB_NUMTCS_ATTR_MAX = __DCB_NUMTCS_ATTR_ENUM_MAX - 1,
 };
 
+enum dcbnl_bcn_attrs{
+	DCB_BCN_ATTR_UNDEFINED = 0,
+
+	DCB_BCN_ATTR_RP_0,
+	DCB_BCN_ATTR_RP_1,
+	DCB_BCN_ATTR_RP_2,
+	DCB_BCN_ATTR_RP_3,
+	DCB_BCN_ATTR_RP_4,
+	DCB_BCN_ATTR_RP_5,
+	DCB_BCN_ATTR_RP_6,
+	DCB_BCN_ATTR_RP_7,
+	DCB_BCN_ATTR_RP_ALL,
+
+	DCB_BCN_ATTR_ALPHA,
+	DCB_BCN_ATTR_BETA,
+	DCB_BCN_ATTR_GD,
+	DCB_BCN_ATTR_GI,
+	DCB_BCN_ATTR_TMAX,
+	DCB_BCN_ATTR_TD,
+	DCB_BCN_ATTR_RMIN,
+	DCB_BCN_ATTR_W,
+	DCB_BCN_ATTR_RD,
+	DCB_BCN_ATTR_RU,
+	DCB_BCN_ATTR_WRTT,
+	DCB_BCN_ATTR_RI,
+	DCB_BCN_ATTR_C,
+	DCB_BCN_ATTR_ALL,
+
+	__DCB_BCN_ATTR_ENUM_MAX,
+	DCB_BCN_ATTR_MAX = __DCB_BCN_ATTR_ENUM_MAX - 1,
+};
+
 /**
  * enum dcb_general_attr_values - general DCB attribute values
  *
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index c7d87caf3f9..91e0a3d7faf 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -44,6 +44,10 @@ struct dcbnl_rtnl_ops {
 	u8   (*setnumtcs)(struct net_device *, int, u8);
 	u8   (*getpfcstate)(struct net_device *);
 	void (*setpfcstate)(struct net_device *, u8);
+	void (*getbcncfg)(struct net_device *, int, u32 *);
+	void (*setbcncfg)(struct net_device *, int, u32);
+	void (*getbcnrp)(struct net_device *, int, u8 *);
+	void (*setbcnrp)(struct net_device *, int, u8);
 };
 
 #endif /* __NET_DCBNL_H__ */
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 758419c6f59..b2bda3f610d 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -55,14 +55,15 @@ MODULE_LICENSE("GPL");
 
 /* DCB netlink attributes policy */
 static struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
-	[DCB_ATTR_IFNAME]    = {.type = NLA_STRING, .len = IFNAMSIZ - 1},
-	[DCB_ATTR_STATE]     = {.type = NLA_U8},
-	[DCB_ATTR_PFC_CFG]   = {.type = NLA_NESTED},
-	[DCB_ATTR_PG_CFG]    = {.type = NLA_NESTED},
-	[DCB_ATTR_SET_ALL]   = {.type = NLA_U8},
+	[DCB_ATTR_IFNAME]      = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
+	[DCB_ATTR_STATE]       = {.type = NLA_U8},
+	[DCB_ATTR_PFC_CFG]     = {.type = NLA_NESTED},
+	[DCB_ATTR_PG_CFG]      = {.type = NLA_NESTED},
+	[DCB_ATTR_SET_ALL]     = {.type = NLA_U8},
 	[DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
-	[DCB_ATTR_CAP]       = {.type = NLA_NESTED},
-	[DCB_ATTR_PFC_STATE] = {.type = NLA_U8},
+	[DCB_ATTR_CAP]         = {.type = NLA_NESTED},
+	[DCB_ATTR_PFC_STATE]   = {.type = NLA_U8},
+	[DCB_ATTR_BCN]         = {.type = NLA_NESTED},
 };
 
 /* DCB priority flow control to User Priority nested attributes */
@@ -128,6 +129,33 @@ static struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = {
 	[DCB_NUMTCS_ATTR_PFC]     = {.type = NLA_U8},
 };
 
+/* DCB BCN nested attributes. */
+static struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = {
+	[DCB_BCN_ATTR_RP_0]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_1]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_2]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_3]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_4]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_5]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_6]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_7]         = {.type = NLA_U8},
+	[DCB_BCN_ATTR_RP_ALL]       = {.type = NLA_FLAG},
+	[DCB_BCN_ATTR_ALPHA]        = {.type = NLA_U32},
+	[DCB_BCN_ATTR_BETA]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_GD]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_GI]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_TMAX]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_TD]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RMIN]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_W]            = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RD]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RU]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_WRTT]         = {.type = NLA_U32},
+	[DCB_BCN_ATTR_RI]           = {.type = NLA_U32},
+	[DCB_BCN_ATTR_C]            = {.type = NLA_U32},
+	[DCB_BCN_ATTR_ALL]          = {.type = NLA_FLAG},
+};
+
 /* standard netlink reply call */
 static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
                        u32 seq, u16 flags)
@@ -843,6 +871,130 @@ static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlattr **tb,
 	return __dcbnl_pg_setcfg(netdev, tb, pid, seq, flags, 1);
 }
 
+static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlattr **tb,
+                            u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *dcbnl_skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *bcn_nest;
+	struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1];
+	u8 value_byte;
+	u32 value_integer;
+	int ret  = -EINVAL;
+	bool getall = false;
+	int i;
+
+	if (!tb[DCB_ATTR_BCN] || !netdev->dcbnl_ops->getbcnrp ||
+	    !netdev->dcbnl_ops->getbcncfg)
+		return ret;
+
+	ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX,
+	                       tb[DCB_ATTR_BCN], dcbnl_bcn_nest);
+
+	if (ret)
+		goto err_out;
+
+	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!dcbnl_skb)
+		goto err_out;
+
+	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_BCN_GCFG;
+
+	bcn_nest = nla_nest_start(dcbnl_skb, DCB_ATTR_BCN);
+	if (!bcn_nest)
+		goto err;
+
+	if (bcn_tb[DCB_BCN_ATTR_ALL])
+		getall = true;
+
+	for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+		if (!getall && !bcn_tb[i])
+			continue;
+
+		netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0,
+		                            &value_byte);
+		ret = nla_put_u8(dcbnl_skb, i, value_byte);
+		if (ret)
+			goto err_bcn;
+	}
+
+	for (i = DCB_BCN_ATTR_ALPHA; i <= DCB_BCN_ATTR_RI; i++) {
+		if (!getall && !bcn_tb[i])
+			continue;
+
+		netdev->dcbnl_ops->getbcncfg(netdev, i,
+		                             &value_integer);
+		ret = nla_put_u32(dcbnl_skb, i, value_integer);
+		if (ret)
+			goto err_bcn;
+	}
+
+	nla_nest_end(dcbnl_skb, bcn_nest);
+
+	nlmsg_end(dcbnl_skb, nlh);
+
+	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err_bcn:
+	nla_nest_cancel(dcbnl_skb, bcn_nest);
+nlmsg_failure:
+err:
+	kfree(dcbnl_skb);
+err_out:
+	ret  = -EINVAL;
+	return ret;
+}
+
+static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlattr **tb,
+                            u32 pid, u32 seq, u16 flags)
+{
+	struct nlattr *data[DCB_BCN_ATTR_MAX + 1];
+	int i;
+	int ret = -EINVAL;
+	u8 value_byte;
+	u32 value_int;
+
+	if (!tb[DCB_ATTR_BCN] || !netdev->dcbnl_ops->setbcncfg
+	    || !netdev->dcbnl_ops->setbcnrp)
+		return ret;
+
+	ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX,
+	                       tb[DCB_ATTR_BCN],
+	                       dcbnl_pfc_up_nest);
+	if (ret)
+		goto err;
+
+	for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
+		if (data[i] == NULL)
+			continue;
+		value_byte = nla_get_u8(data[i]);
+		netdev->dcbnl_ops->setbcnrp(netdev,
+			data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte);
+	}
+
+	for (i = DCB_BCN_ATTR_ALPHA; i <= DCB_BCN_ATTR_RI; i++) {
+		if (data[i] == NULL)
+			continue;
+		value_int = nla_get_u32(data[i]);
+		netdev->dcbnl_ops->setbcncfg(netdev,
+	                                     i, value_int);
+	}
+
+	ret = dcbnl_reply(0, RTM_SETDCB, DCB_CMD_BCN_SCFG, DCB_ATTR_BCN,
+	                  pid, seq, flags);
+err:
+	return ret;
+}
+
 static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
@@ -891,6 +1043,10 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		ret = dcbnl_pgrx_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
 		                        nlh->nlmsg_flags);
 		goto out;
+	case DCB_CMD_BCN_GCFG:
+		ret = dcbnl_bcn_getcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                       nlh->nlmsg_flags);
+		goto out;
 	case DCB_CMD_SSTATE:
 		ret = dcbnl_setstate(netdev, tb, pid, nlh->nlmsg_seq,
 		                     nlh->nlmsg_flags);
@@ -932,6 +1088,10 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		ret = dcbnl_setpfcstate(netdev, tb, pid, nlh->nlmsg_seq,
 		                        nlh->nlmsg_flags);
 		goto out;
+	case DCB_CMD_BCN_SCFG:
+		ret = dcbnl_bcn_setcfg(netdev, tb, pid, nlh->nlmsg_seq,
+		                       nlh->nlmsg_flags);
+		goto out;
 	default:
 		goto errout;
 	}
-- 
cgit v1.2.3-70-g09d2


From 2baf8a2daab65cdd3f20bfeb4676a2f6aff7c3bf Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Fri, 21 Nov 2008 16:34:18 -0800
Subject: netdevice hdlc: Convert directly reference of netdev->priv

For killing directly reference of netdev->priv, use netdev->ml_priv to replace it.
Because the private pvc data comes from add_pvc() and can't be allocated in
alloc_netdev().

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
Acked-by: Krzysztof Halasa <khc@pm.waw.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wan/hdlc_fr.c | 10 +++++-----
 include/linux/hdlc.h      |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index d3d5055741a..f1ddd7c3459 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -342,7 +342,7 @@ static int fr_hard_header(struct sk_buff **skb_p, u16 dlci)
 
 static int pvc_open(struct net_device *dev)
 {
-	pvc_device *pvc = dev->priv;
+	pvc_device *pvc = dev->ml_priv;
 
 	if ((pvc->frad->flags & IFF_UP) == 0)
 		return -EIO;  /* Frad must be UP in order to activate PVC */
@@ -362,7 +362,7 @@ static int pvc_open(struct net_device *dev)
 
 static int pvc_close(struct net_device *dev)
 {
-	pvc_device *pvc = dev->priv;
+	pvc_device *pvc = dev->ml_priv;
 
 	if (--pvc->open_count == 0) {
 		hdlc_device *hdlc = dev_to_hdlc(pvc->frad);
@@ -381,7 +381,7 @@ static int pvc_close(struct net_device *dev)
 
 static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
-	pvc_device *pvc = dev->priv;
+	pvc_device *pvc = dev->ml_priv;
 	fr_proto_pvc_info info;
 
 	if (ifr->ifr_settings.type == IF_GET_PROTO) {
@@ -409,7 +409,7 @@ static int pvc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 
 static int pvc_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	pvc_device *pvc = dev->priv;
+	pvc_device *pvc = dev->ml_priv;
 
 	if (pvc->state.active) {
 		if (dev->type == ARPHRD_ETHER) {
@@ -1111,7 +1111,7 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 	dev->change_mtu = pvc_change_mtu;
 	dev->mtu = HDLC_MAX_MTU;
 	dev->tx_queue_len = 0;
-	dev->priv = pvc;
+	dev->ml_priv = pvc;
 
 	result = dev_alloc_name(dev, dev->name);
 	if (result < 0) {
diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index c59769693be..e960faac609 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -80,7 +80,7 @@ struct net_device *alloc_hdlcdev(void *priv);
 
 static inline struct hdlc_device* dev_to_hdlc(struct net_device *dev)
 {
-	return dev->priv;
+	return netdev_priv(dev);
 }
 
 static __inline__ void debug_frame(const struct sk_buff *skb)
-- 
cgit v1.2.3-70-g09d2


From f201ae2356c74bcae130b2177b3dca903ea98071 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 23 Nov 2008 06:22:56 +0100
Subject: tracing/function-return-tracer: store return stack into task_struct
 and allocate it dynamically

Impact: use deeper function tracing depth safely

Some tests showed that function return tracing needed a more deeper depth
of function calls. But it could be unsafe to store these return addresses
to the stack.

So these arrays will now be allocated dynamically into task_struct of current
only when the tracer is activated.

Typical scheme when tracer is activated:
- allocate a return stack for each task in global list.
- fork: allocate the return stack for the newly created task
- exit: free return stack of current
- idle init: same as fork

I chose a default depth of 50. I don't have overruns anymore.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ftrace.h      |  1 -
 arch/x86/include/asm/thread_info.h | 29 ------------
 arch/x86/kernel/ftrace.c           | 29 ++++++------
 include/linux/ftrace.h             |  5 ++
 include/linux/sched.h              | 23 +++++----
 kernel/exit.c                      |  5 +-
 kernel/fork.c                      |  4 ++
 kernel/sched.c                     |  3 ++
 kernel/trace/ftrace.c              | 96 +++++++++++++++++++++++++++++++++++++-
 9 files changed, 137 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 2bb43b433e0..754a3e082f9 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -29,7 +29,6 @@ struct dyn_arch_ftrace {
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
-#define FTRACE_RET_STACK_SIZE 20
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e90e81ef6ab..0921b4018c1 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,36 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-
-#ifdef CONFIG_FUNCTION_RET_TRACER
-	/* Index of current stored adress in ret_stack */
-	int		curr_ret_stack;
-	/* Stack of return addresses for return function tracing */
-	struct ftrace_ret_stack	ret_stack[FTRACE_RET_STACK_SIZE];
-	/*
-	 * Number of functions that haven't been traced
-	 * because of depth overrun.
-	 */
-	atomic_t	trace_overrun;
-#endif
 };
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
-#define INIT_THREAD_INFO(tsk)			\
-{						\
-	.task		= &tsk,			\
-	.exec_domain	= &default_exec_domain,	\
-	.flags		= 0,			\
-	.cpu		= 0,			\
-	.preempt_count	= 1,			\
-	.addr_limit	= KERNEL_DS,		\
-	.restart_block = {			\
-		.fn = do_no_restart_syscall,	\
-	},					\
-	.curr_ret_stack = -1,\
-	.trace_overrun	= ATOMIC_INIT(0)	\
-}
-#else
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.task		= &tsk,			\
@@ -82,7 +54,6 @@ struct thread_info {
 		.fn = do_no_restart_syscall,	\
 	},					\
 }
-#endif
 
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 356bb1eb6e9..bb137f7297e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -350,19 +350,21 @@ static int push_return_trace(unsigned long ret, unsigned long long time,
 				unsigned long func)
 {
 	int index;
-	struct thread_info *ti = current_thread_info();
+
+	if (!current->ret_stack)
+		return -EBUSY;
 
 	/* The return trace stack is full */
-	if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) {
-		atomic_inc(&ti->trace_overrun);
+	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+		atomic_inc(&current->trace_overrun);
 		return -EBUSY;
 	}
 
-	index = ++ti->curr_ret_stack;
+	index = ++current->curr_ret_stack;
 	barrier();
-	ti->ret_stack[index].ret = ret;
-	ti->ret_stack[index].func = func;
-	ti->ret_stack[index].calltime = time;
+	current->ret_stack[index].ret = ret;
+	current->ret_stack[index].func = func;
+	current->ret_stack[index].calltime = time;
 
 	return 0;
 }
@@ -373,13 +375,12 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time,
 {
 	int index;
 
-	struct thread_info *ti = current_thread_info();
-	index = ti->curr_ret_stack;
-	*ret = ti->ret_stack[index].ret;
-	*func = ti->ret_stack[index].func;
-	*time = ti->ret_stack[index].calltime;
-	*overrun = atomic_read(&ti->trace_overrun);
-	ti->curr_ret_stack--;
+	index = current->curr_ret_stack;
+	*ret = current->ret_stack[index].ret;
+	*func = current->ret_stack[index].func;
+	*time = current->ret_stack[index].calltime;
+	*overrun = atomic_read(&current->trace_overrun);
+	current->curr_ret_stack--;
 }
 
 /*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f7ba4ea5e12..2ba259b2def 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -323,6 +323,8 @@ struct ftrace_retfunc {
 };
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
+#define FTRACE_RETFUNC_DEPTH 50
+#define FTRACE_RETSTACK_ALLOC_SIZE 32
 /* Type of a callback handler of tracing return function */
 typedef void (*trace_function_return_t)(struct ftrace_retfunc *);
 
@@ -330,6 +332,9 @@ extern int register_ftrace_return(trace_function_return_t func);
 /* The current handler in use */
 extern trace_function_return_t ftrace_function_return;
 extern void unregister_ftrace_return(void);
+
+extern void ftrace_retfunc_init_task(struct task_struct *t);
+extern void ftrace_retfunc_exit_task(struct task_struct *t);
 #endif
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c8e0db46420..bee1e93c95a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1352,6 +1352,17 @@ struct task_struct {
 	unsigned long default_timer_slack_ns;
 
 	struct list_head	*scm_work_list;
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	/* Index of current stored adress in ret_stack */
+	int curr_ret_stack;
+	/* Stack of return addresses for return function tracing */
+	struct ftrace_ret_stack	*ret_stack;
+	/*
+	 * Number of functions that haven't been traced
+	 * because of depth overrun.
+	 */
+	atomic_t trace_overrun;
+#endif
 };
 
 /*
@@ -2006,18 +2017,6 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
 {
 	*task_thread_info(p) = *task_thread_info(org);
 	task_thread_info(p)->task = p;
-
-#ifdef CONFIG_FUNCTION_RET_TRACER
-	/*
-	 * When fork() creates a child process, this function is called.
-	 * But the child task may not inherit the return adresses traced
-	 * by the return function tracer because it will directly execute
-	 * in userspace and will not return to kernel functions its parent
-	 * used.
-	 */
-	task_thread_info(p)->curr_ret_stack = -1;
-	atomic_set(&task_thread_info(p)->trace_overrun, 0);
-#endif
 }
 
 static inline unsigned long *end_of_stack(struct task_struct *p)
diff --git a/kernel/exit.c b/kernel/exit.c
index 35c8ec2ba03..b9d446329da 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,6 +47,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <trace/sched.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1127,7 +1128,9 @@ NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	ftrace_retfunc_exit_task(tsk);
+#endif
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
diff --git a/kernel/fork.c b/kernel/fork.c
index ac62f43ee43..d1eb30e69cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -47,6 +47,7 @@
 #include <linux/mount.h>
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
+#include <linux/ftrace.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
@@ -1269,6 +1270,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	ftrace_retfunc_init_task(p);
+#endif
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
diff --git a/kernel/sched.c b/kernel/sched.c
index 4de56108c86..fb17205950d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5901,6 +5901,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
+#ifdef CONFIG_FUNCTION_RET_TRACER
+	ftrace_retfunc_init_task(idle);
+#endif
 }
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f212da48668..90d99fb02ae 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1498,10 +1498,77 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_RET_TRACER
 
+static atomic_t ftrace_retfunc_active;
+
 /* The callback that hooks the return of a function */
 trace_function_return_t ftrace_function_return =
 			(trace_function_return_t)ftrace_stub;
 
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+	int i;
+	int ret = 0;
+	unsigned long flags;
+	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+	struct task_struct *g, *t;
+
+	for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+		ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
+					* sizeof(struct ftrace_ret_stack),
+					GFP_KERNEL);
+		if (!ret_stack_list[i]) {
+			start = 0;
+			end = i;
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	read_lock_irqsave(&tasklist_lock, flags);
+	do_each_thread(g, t) {
+		if (start == end) {
+			ret = -EAGAIN;
+			goto unlock;
+		}
+
+		if (t->ret_stack == NULL) {
+			t->ret_stack = ret_stack_list[start++];
+			t->curr_ret_stack = -1;
+			atomic_set(&t->trace_overrun, 0);
+		}
+	} while_each_thread(g, t);
+
+unlock:
+	read_unlock_irqrestore(&tasklist_lock, flags);
+free:
+	for (i = start; i < end; i++)
+		kfree(ret_stack_list[i]);
+	return ret;
+}
+
+/* Allocate a return stack for each task */
+static int start_return_tracing(void)
+{
+	struct ftrace_ret_stack **ret_stack_list;
+	int ret;
+
+	ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
+				sizeof(struct ftrace_ret_stack *),
+				GFP_KERNEL);
+
+	if (!ret_stack_list)
+		return -ENOMEM;
+
+	do {
+		ret = alloc_retstack_tasklist(ret_stack_list);
+	} while (ret == -EAGAIN);
+
+	kfree(ret_stack_list);
+	return ret;
+}
+
 int register_ftrace_return(trace_function_return_t func)
 {
 	int ret = 0;
@@ -1516,7 +1583,12 @@ int register_ftrace_return(trace_function_return_t func)
 		ret = -EBUSY;
 		goto out;
 	}
-
+	atomic_inc(&ftrace_retfunc_active);
+	ret = start_return_tracing();
+	if (ret) {
+		atomic_dec(&ftrace_retfunc_active);
+		goto out;
+	}
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
 	ftrace_function_return = func;
 	ftrace_startup();
@@ -1530,6 +1602,7 @@ void unregister_ftrace_return(void)
 {
 	mutex_lock(&ftrace_sysctl_lock);
 
+	atomic_dec(&ftrace_retfunc_active);
 	ftrace_function_return = (trace_function_return_t)ftrace_stub;
 	ftrace_shutdown();
 	/* Restore normal tracing type */
@@ -1537,6 +1610,27 @@ void unregister_ftrace_return(void)
 
 	mutex_unlock(&ftrace_sysctl_lock);
 }
+
+/* Allocate a return stack for newly created task */
+void ftrace_retfunc_init_task(struct task_struct *t)
+{
+	if (atomic_read(&ftrace_retfunc_active)) {
+		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+				* sizeof(struct ftrace_ret_stack),
+				GFP_KERNEL);
+		if (!t->ret_stack)
+			return;
+		t->curr_ret_stack = -1;
+		atomic_set(&t->trace_overrun, 0);
+	} else
+		t->ret_stack = NULL;
+}
+
+void ftrace_retfunc_exit_task(struct task_struct *t)
+{
+	kfree(t->ret_stack);
+	t->ret_stack = NULL;
+}
 #endif
 
 
-- 
cgit v1.2.3-70-g09d2


From 82f60f0bc854aada696f27d863c03bef91f1509d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 23 Nov 2008 09:18:56 +0100
Subject: tracing/function-return-tracer: clean up task start/exit callbacks

Impact: cleanup

Eliminate #ifdefs in core code by using empty inline functions.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 3 +++
 kernel/exit.c          | 2 --
 kernel/fork.c          | 2 --
 kernel/sched.c         | 2 --
 4 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2ba259b2def..938ca194264 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -335,6 +335,9 @@ extern void unregister_ftrace_return(void);
 
 extern void ftrace_retfunc_init_task(struct task_struct *t);
 extern void ftrace_retfunc_exit_task(struct task_struct *t);
+#else
+static inline void ftrace_retfunc_init_task(struct task_struct *t) { }
+static inline void ftrace_retfunc_exit_task(struct task_struct *t) { }
 #endif
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index b9d446329da..ef04d03b328 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1128,9 +1128,7 @@ NORET_TYPE void do_exit(long code)
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
-#ifdef CONFIG_FUNCTION_RET_TRACER
 	ftrace_retfunc_exit_task(tsk);
-#endif
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
diff --git a/kernel/fork.c b/kernel/fork.c
index d1eb30e69cc..fbf4a4c0a62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1270,9 +1270,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-#ifdef CONFIG_FUNCTION_RET_TRACER
 	ftrace_retfunc_init_task(p);
-#endif
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
diff --git a/kernel/sched.c b/kernel/sched.c
index fb17205950d..388d9db044a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5901,9 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-#ifdef CONFIG_FUNCTION_RET_TRACER
 	ftrace_retfunc_init_task(idle);
-#endif
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 02b67518e2b1c490787dac7f35e1204e74fe21ba Mon Sep 17 00:00:00 2001
From: Török Edwin <edwintorok@gmail.com>
Date: Sat, 22 Nov 2008 13:28:47 +0200
Subject: tracing: add support for userspace stacktraces in tracing/iter_ctrl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: add new (default-off) tracing visualization feature

Usage example:

 mount -t debugfs nodev /sys/kernel/debug
 cd /sys/kernel/debug/tracing
 echo userstacktrace >iter_ctrl
 echo sched_switch >current_tracer
 echo 1 >tracing_enabled
 .... run application ...
 echo 0 >tracing_enabled

Then read one of 'trace','latency_trace','trace_pipe'.

To get the best output you can compile your userspace programs with
frame pointers (at least glibc + the app you are tracing).

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/ftrace.txt     |  5 ++-
 arch/x86/kernel/stacktrace.c | 57 +++++++++++++++++++++++++++
 include/linux/stacktrace.h   |  8 ++++
 kernel/trace/trace.c         | 93 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h         |  9 +++++
 5 files changed, 171 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
index 753f4de4b17..79a80f79c06 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -324,7 +324,7 @@ output. To see what is available, simply cat the file:
 
   cat /debug/tracing/trace_options
   print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
- noblock nostacktrace nosched-tree
+ noblock nostacktrace nosched-tree nouserstacktrace
 
 To disable one of the options, echo in the option prepended with "no".
 
@@ -378,6 +378,9 @@ Here are the available options:
 		When a trace is recorded, so is the stack of functions.
 		This allows for back traces of trace sites.
 
+  userstacktrace - This option changes the trace.
+		   It records a stacktrace of the current userspace thread.
+
   sched-tree - TBD (any users??)
 
 
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c..b1515306041 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 
 static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,59 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+	/*
+	 * Trace user stack if we are not a kernel thread
+	 */
+	if (current->mm) {
+		const struct pt_regs *regs = task_pt_regs(current);
+		const void __user *fp = (const void __user *)regs->bp;
+
+		if (trace->nr_entries < trace->max_entries)
+			trace->entries[trace->nr_entries++] = regs->ip;
+
+		while (trace->nr_entries < trace->max_entries) {
+			struct stack_frame frame;
+			frame.next_fp = NULL;
+			frame.return_address = 0;
+			if (!copy_stack_frame(fp, &frame))
+				break;
+			if ((unsigned long)fp < regs->sp)
+				break;
+			if (frame.return_address)
+				trace->entries[trace->nr_entries++] =
+					frame.return_address;
+			if (fp == frame.next_fp)
+				break;
+			fp = frame.next_fp;
+		}
+	}
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index b106fd8e0d5..68de51468f5 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
 				struct stack_trace *trace);
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
+
+#ifdef CONFIG_X86
+extern void save_stack_trace_user(struct stack_trace *trace);
+#else
+# define save_stack_trace_user(trace)              do { } while (0)
+#endif
+
 #else
 # define save_stack_trace(trace)			do { } while (0)
 # define save_stack_trace_tsk(tsk, trace)		do { } while (0)
+# define save_stack_trace_user(trace)              do { } while (0)
 # define print_stack_trace(trace, spaces)		do { } while (0)
 #endif
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4ee6f037522..ced8b4fa9f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,6 +275,7 @@ static const char *trace_options[] = {
 	"ftrace_preempt",
 	"branch",
 	"annotate",
+	"userstacktrace",
 	NULL
 };
 
@@ -918,6 +919,44 @@ void __trace_stack(struct trace_array *tr,
 	ftrace_trace_stack(tr, data, flags, skip, preempt_count());
 }
 
+static void ftrace_trace_userstack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags, int pc)
+{
+	struct userstack_entry *entry;
+	struct stack_trace trace;
+	struct ring_buffer_event *event;
+	unsigned long irq_flags;
+
+	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+		return;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type		= TRACE_USER_STACK;
+
+	memset(&entry->caller, 0, sizeof(entry->caller));
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= FTRACE_STACK_ENTRIES;
+	trace.skip		= 0;
+	trace.entries		= entry->caller;
+
+	save_stack_trace_user(&trace);
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+}
+
+void __trace_userstack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags)
+{
+	ftrace_trace_userstack(tr, data, flags, preempt_count());
+}
+
 static void
 ftrace_trace_special(void *__tr, void *__data,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -941,6 +980,7 @@ ftrace_trace_special(void *__tr, void *__data,
 	entry->arg3			= arg3;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+	ftrace_trace_userstack(tr, data, irq_flags, pc);
 
 	trace_wake_up();
 }
@@ -979,6 +1019,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_cpu	= task_cpu(next);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ftrace_trace_stack(tr, data, flags, 5, pc);
+	ftrace_trace_userstack(tr, data, flags, pc);
 }
 
 void
@@ -1008,6 +1049,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_cpu			= task_cpu(wakee);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 	ftrace_trace_stack(tr, data, flags, 6, pc);
+	ftrace_trace_userstack(tr, data, flags, pc);
 
 	trace_wake_up();
 }
@@ -1387,6 +1429,31 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+		unsigned long sym_flags)
+{
+	int ret = 1;
+	unsigned i;
+
+	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		unsigned long ip = entry->caller[i];
+
+		if (ip == ULONG_MAX || !ret)
+			break;
+		if (i)
+			ret = trace_seq_puts(s, " <- ");
+		if (!ip) {
+			ret = trace_seq_puts(s, "??");
+			continue;
+		}
+		if (ret /*&& (sym_flags & TRACE_ITER_SYM_ADDR)*/)
+			ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	}
+
+	return ret;
+}
+
 static void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                  _------=> CPU#            \n");
@@ -1702,6 +1769,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 field->line);
 		break;
 	}
+	case TRACE_USER_STACK: {
+		struct userstack_entry *field;
+
+		trace_assign_type(field, entry);
+
+		seq_print_userip_objs(field, s, sym_flags);
+		if (entry->flags & TRACE_FLAG_CONT)
+			trace_seq_print_cont(s, iter);
+		break;
+	}
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1853,6 +1930,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 				 field->line);
 		break;
 	}
+	case TRACE_USER_STACK: {
+		struct userstack_entry *field;
+
+		trace_assign_type(field, entry);
+
+		ret = seq_print_userip_objs(field, s, sym_flags);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		ret = trace_seq_putc(s, '\n');
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -1912,6 +2002,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_SPECIAL:
+	case TRACE_USER_STACK:
 	case TRACE_STACK: {
 		struct special_entry *field;
 
@@ -2000,6 +2091,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_SPECIAL:
+	case TRACE_USER_STACK:
 	case TRACE_STACK: {
 		struct special_entry *field;
 
@@ -2054,6 +2146,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_SPECIAL:
+	case TRACE_USER_STACK:
 	case TRACE_STACK: {
 		struct special_entry *field;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cb12fd98f6..17bb4c830b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -26,6 +26,7 @@ enum trace_type {
 	TRACE_BOOT_CALL,
 	TRACE_BOOT_RET,
 	TRACE_FN_RET,
+	TRACE_USER_STACK,
 
 	__TRACE_LAST_TYPE
 };
@@ -42,6 +43,7 @@ struct trace_entry {
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
+	int			tgid;
 };
 
 /*
@@ -99,6 +101,11 @@ struct stack_entry {
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 };
 
+struct userstack_entry {
+	struct trace_entry	ent;
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
 /*
  * ftrace_printk entry:
  */
@@ -240,6 +247,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
 		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
+		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
@@ -500,6 +508,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_PREEMPTONLY		= 0x800,
 	TRACE_ITER_BRANCH		= 0x1000,
 	TRACE_ITER_ANNOTATE		= 0x2000,
+	TRACE_ITER_USERSTACKTRACE       = 0x4000
 };
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 74e2f334f4440cbcb63e9ebbcdcea430d41bdfa3 Mon Sep 17 00:00:00 2001
From: Török Edwin <edwintorok@gmail.com>
Date: Sat, 22 Nov 2008 13:28:48 +0200
Subject: vfs, seqfile: make mangle_path() global
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: expose new VFS API

make mangle_path() available, as per the suggestions of Christoph Hellwig
and Al Viro:

  http://lkml.org/lkml/2008/11/4/338

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/seq_file.c            | 14 +++++++++++++-
 include/linux/seq_file.h |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index eba2eabcd2b..f5b61cc1cc1 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
 }
 EXPORT_SYMBOL(seq_printf);
 
-static char *mangle_path(char *s, char *p, char *esc)
+/**
+ * 	mangle_path - mangle and copy path to buffer beginning
+ * 	@s - buffer start
+ * 	@p - beginning of path in above buffer
+ *      @esc - set of characters that need escaping
+ *
+ *      Copy the path from @p to @s, replacing each occurrence of character from
+ *      @esc with usual octal escape.
+ *      Returns pointer past last written character in @s, or NULL in case of
+ *      failure.
+ */
+char *mangle_path(char *s, char *p, char *esc)
 {
 	while (s <= p) {
 		char c = *p++;
@@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(mangle_path);
 
 /*
  * return the absolute path of 'dentry' residing in mount 'mnt'.
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index dc50bcc282a..b3dfa72f13b 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -34,6 +34,7 @@ struct seq_operations {
 
 #define SEQ_SKIP 1
 
+char *mangle_path(char *s, char *p, char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
-- 
cgit v1.2.3-70-g09d2


From 42f565e116e0408b5ddc21a33c4a4d41fd572420 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 20 Nov 2008 23:57:47 -0500
Subject: trace: remove extra assign in branch check

Impact: clean up of branch check

The unlikely/likely profiler does an extra assign of the f.line.
This is not needed since it is already calculated at compile time.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/compiler.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c7d804a7a4d..c25e525121f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -87,7 +87,6 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 				.file = __FILE__,			\
 				.line = __LINE__,			\
 			};						\
-			______f.line = __LINE__;			\
 			______r = likely_notrace(x);			\
 			ftrace_likely_update(&______f, ______r, 1);	\
 			______r;					\
@@ -102,7 +101,6 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 				.file = __FILE__,			\
 				.line = __LINE__,			\
 			};						\
-			______f.line = __LINE__;			\
 			______r = unlikely_notrace(x);			\
 			ftrace_likely_update(&______f, ______r, 0);	\
 			______r;					\
-- 
cgit v1.2.3-70-g09d2


From 45b797492a0758e64dff74e9db70e1f65e0603a5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 00:40:40 -0500
Subject: trace: consolidate unlikely and likely profiler

Impact: clean up to make one profiler of like and unlikely tracer

The likely and unlikely profiler prints out the file and line numbers
of the annotated branches that it is profiling. It shows the number
of times it was correct or incorrect in its guess. Having two
different files or sections for that matter to tell us if it was a
likely or unlikely is pretty pointless. We really only care if
it was correct or not.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h |  9 +++------
 include/linux/compiler.h          | 24 +++++-------------------
 kernel/trace/Kconfig              |  3 +--
 kernel/trace/trace_branch.c       | 39 +++++++++++++--------------------------
 4 files changed, 22 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 3b46ae46493..8bccb49981e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -46,12 +46,9 @@
 #endif
 
 #ifdef CONFIG_TRACE_BRANCH_PROFILING
-#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_likely_profile) = .;   \
-				*(_ftrace_likely)			      \
-				VMLINUX_SYMBOL(__stop_likely_profile) = .;    \
-				VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
-				*(_ftrace_unlikely)			      \
-				VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \
+				*(_ftrace_annotated_branch)			      \
+				VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .;
 #else
 #define LIKELY_PROFILE()
 #endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c25e525121f..0628a2013fa 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -77,32 +77,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 #define likely_notrace(x)	__builtin_expect(!!(x), 1)
 #define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
 
-#define likely_check(x) ({						\
+#define __branch_check__(x, expect) ({					\
 			int ______r;					\
 			static struct ftrace_branch_data		\
 				__attribute__((__aligned__(4)))		\
-				__attribute__((section("_ftrace_likely"))) \
+				__attribute__((section("_ftrace_annotated_branch"))) \
 				______f = {				\
 				.func = __func__,			\
 				.file = __FILE__,			\
 				.line = __LINE__,			\
 			};						\
 			______r = likely_notrace(x);			\
-			ftrace_likely_update(&______f, ______r, 1);	\
-			______r;					\
-		})
-#define unlikely_check(x) ({						\
-			int ______r;					\
-			static struct ftrace_branch_data		\
-				__attribute__((__aligned__(4)))		\
-				__attribute__((section("_ftrace_unlikely"))) \
-				______f = {				\
-				.func = __func__,			\
-				.file = __FILE__,			\
-				.line = __LINE__,			\
-			};						\
-			______r = unlikely_notrace(x);			\
-			ftrace_likely_update(&______f, ______r, 0);	\
+			ftrace_likely_update(&______f, ______r, expect); \
 			______r;					\
 		})
 
@@ -112,10 +98,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
  * written by Daniel Walker.
  */
 # ifndef likely
-#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : likely_check(x))
+#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
 # endif
 # ifndef unlikely
-#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
+#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
 # endif
 #else
 # define likely(x)	__builtin_expect(!!(x), 1)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b8378fad29a..7e354870570 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -166,8 +166,7 @@ config TRACE_BRANCH_PROFILING
 	  This tracer profiles all the the likely and unlikely macros
 	  in the kernel. It will display the results in:
 
-	  /debugfs/tracing/profile_likely
-	  /debugfs/tracing/profile_unlikely
+	  /debugfs/tracing/profile_annotated_branch
 
 	  Note: this will add a significant overhead, only turn this
 	  on if you need to profile the system's use of these macros.
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 23f9b02ce96..21dedc8b50a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -261,7 +261,7 @@ static struct seq_operations tracing_likely_seq_ops = {
 	.show		= t_show,
 };
 
-static int tracing_likely_open(struct inode *inode, struct file *file)
+static int tracing_branch_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
@@ -274,25 +274,18 @@ static int tracing_likely_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations tracing_likely_fops = {
-	.open		= tracing_likely_open,
+static const struct file_operations tracing_branch_fops = {
+	.open		= tracing_branch_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 };
 
-extern unsigned long __start_likely_profile[];
-extern unsigned long __stop_likely_profile[];
-extern unsigned long __start_unlikely_profile[];
-extern unsigned long __stop_unlikely_profile[];
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
 
-static struct ftrace_pointer ftrace_likely_pos = {
-	.start			= __start_likely_profile,
-	.stop			= __stop_likely_profile,
-};
-
-static struct ftrace_pointer ftrace_unlikely_pos = {
-	.start			= __start_unlikely_profile,
-	.stop			= __stop_unlikely_profile,
+static const struct ftrace_pointer ftrace_annotated_branch_pos = {
+	.start			= __start_annotated_branch_profile,
+	.stop			= __stop_annotated_branch_profile,
 };
 
 static __init int ftrace_branch_init(void)
@@ -302,18 +295,12 @@ static __init int ftrace_branch_init(void)
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
-				    &ftrace_likely_pos,
-				    &tracing_likely_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'profile_likely' entry\n");
-
-	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
-				    &ftrace_unlikely_pos,
-				    &tracing_likely_fops);
+	entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
+				    &ftrace_annotated_branch_pos,
+				    &tracing_branch_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs"
-			   " 'profile_unlikely' entry\n");
+		pr_warning("Could not create debugfs "
+			   "'profile_annotatet_branch' entry\n");
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 2bcd521a684cc94befbe2ce7d5b613c841b0d304 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 01:30:54 -0500
Subject: trace: profile all if conditionals

Impact: feature to profile if statements

This patch adds a branch profiler for all if () statements.
The results will be found in:

  /debugfs/tracing/profile_branch

For example:

   miss      hit    %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       0        1 100 x86_64_start_reservations      head64.c             127
       0        1 100 copy_bootdata                  head64.c             69
       1        0   0 x86_64_start_kernel            head64.c             111
      32        0   0 set_intr_gate                  desc.h               319
       1        0   0 reserve_ebda_region            head.c               51
       1        0   0 reserve_ebda_region            head.c               47
       0        1 100 reserve_ebda_region            head.c               42
       0        0   X maxcpus                        main.c               165

Miss means the branch was not taken. Hit means the branch was taken.
The percent is the percentage the branch was taken.

This adds a significant amount of overhead and should only be used
by those analyzing their system.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h | 11 ++++++++++-
 include/linux/compiler.h          | 38 ++++++++++++++++++++++++++++++++++++--
 kernel/trace/Kconfig              | 16 ++++++++++++++++
 kernel/trace/trace_branch.c       | 33 +++++++++++++++++++++++++++++++--
 4 files changed, 93 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 8bccb49981e..eba835a2c2c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -53,6 +53,14 @@
 #define LIKELY_PROFILE()
 #endif
 
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+#define BRANCH_PROFILE()	VMLINUX_SYMBOL(__start_branch_profile) = .;   \
+				*(_ftrace_branch)			      \
+				VMLINUX_SYMBOL(__stop_branch_profile) = .;
+#else
+#define BRANCH_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -72,7 +80,8 @@
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
 	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
-	LIKELY_PROFILE()
+	LIKELY_PROFILE()		       				\
+	BRANCH_PROFILE()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 0628a2013fa..ea7c6be354b 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -63,8 +63,16 @@ struct ftrace_branch_data {
 	const char *func;
 	const char *file;
 	unsigned line;
-	unsigned long correct;
-	unsigned long incorrect;
+	union {
+		struct {
+			unsigned long correct;
+			unsigned long incorrect;
+		};
+		struct {
+			unsigned long miss;
+			unsigned long hit;
+		};
+	};
 };
 
 /*
@@ -103,6 +111,32 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 # ifndef unlikely
 #  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
 # endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+/*
+ * "Define 'is'", Bill Clinton
+ * "Define 'if'", Steven Rostedt
+ */
+#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) :		\
+	({								\
+		int ______r;						\
+		static struct ftrace_branch_data			\
+			__attribute__((__aligned__(4)))			\
+			__attribute__((section("_ftrace_branch")))	\
+			______f = {					\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+		______r = !!(cond);					\
+		if (______r)						\
+			______f.hit++;					\
+		else							\
+			______f.miss++;					\
+		______r;						\
+	}))
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
 #else
 # define likely(x)	__builtin_expect(!!(x), 1)
 # define unlikely(x)	__builtin_expect(!!(x), 0)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e354870570..61e8cca6ff4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -173,6 +173,22 @@ config TRACE_BRANCH_PROFILING
 
 	  Say N if unsure.
 
+config PROFILE_ALL_BRANCHES
+	bool "Profile all if conditionals"
+	depends on TRACE_BRANCH_PROFILING
+	help
+	  This tracer profiles all branch conditions. Every if ()
+	  taken in the kernel is recorded whether it hit or miss.
+	  The results will be displayed in:
+
+	  /debugfs/tracing/profile_branch
+
+	  This configuration, when enabled, will impose a great overhead
+	  on the system. This should only be enabled when the system
+	  is to be analyzed
+
+	  Say N if unsure.
+
 config TRACING_BRANCHES
 	bool
 	help
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 142acb3b4e0..85792aec64d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -185,6 +185,7 @@ EXPORT_SYMBOL(ftrace_likely_update);
 struct ftrace_pointer {
 	void		*start;
 	void		*stop;
+	int		hit;
 };
 
 static void *
@@ -223,13 +224,17 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
+	struct ftrace_pointer *fp = m->private;
 	struct ftrace_branch_data *p = v;
 	const char *f;
 	long percent;
 
 	if (v == (void *)1) {
-		seq_printf(m, " correct incorrect  %% "
-			      "       Function                "
+		if (fp->hit)
+			seq_printf(m, "   miss      hit    %% ");
+		else
+			seq_printf(m, " correct incorrect  %% ");
+		seq_printf(m, "       Function                "
 			      "  File              Line\n"
 			      " ------- ---------  - "
 			      "       --------                "
@@ -243,6 +248,9 @@ static int t_show(struct seq_file *m, void *v)
 		f--;
 	f++;
 
+	/*
+	 * The miss is overlayed on correct, and hit on incorrect.
+	 */
 	if (p->correct) {
 		percent = p->incorrect * 100;
 		percent /= p->correct + p->incorrect;
@@ -284,6 +292,18 @@ static const struct file_operations tracing_branch_fops = {
 	.llseek		= seq_lseek,
 };
 
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
+
+static struct ftrace_pointer ftrace_branch_pos = {
+	.start			= __start_branch_profile,
+	.stop			= __stop_branch_profile,
+	.hit			= 1,
+};
+
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
 extern unsigned long __start_annotated_branch_profile[];
 extern unsigned long __stop_annotated_branch_profile[];
 
@@ -306,6 +326,15 @@ static __init int ftrace_branch_init(void)
 		pr_warning("Could not create debugfs "
 			   "'profile_annotatet_branch' entry\n");
 
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+	entry = debugfs_create_file("profile_branch", 0444, d_tracer,
+				    &ftrace_branch_pos,
+				    &tracing_branch_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_branch' entry\n");
+#endif
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 033601a32b2012b6948e80e739cca40bff4de4a0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 12:41:55 -0500
Subject: ring-buffer: add tracing_off_permanent

Impact: feature to permanently disable ring buffer

This patch adds a API to the ring buffer code that will permanently
disable the ring buffer from ever recording. This should only be
called when some serious anomaly is detected, and the system
may be in an unstable state. When that happens, shutting down the
recording to the ring buffers may be appropriate.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h |  1 +
 kernel/trace/ring_buffer.c  | 79 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index e097c2e6b6d..3bb87a753fa 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -122,6 +122,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
 void tracing_on(void);
 void tracing_off(void);
+void tracing_off_permanent(void);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 85ced143c2c..e206951603c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -18,8 +18,46 @@
 
 #include "trace.h"
 
-/* Global flag to disable all recording to ring buffers */
-static int ring_buffers_off __read_mostly;
+/*
+ * A fast way to enable or disable all ring buffers is to
+ * call tracing_on or tracing_off. Turning off the ring buffers
+ * prevents all ring buffers from being recorded to.
+ * Turning this switch on, makes it OK to write to the
+ * ring buffer, if the ring buffer is enabled itself.
+ *
+ * There's three layers that must be on in order to write
+ * to the ring buffer.
+ *
+ * 1) This global flag must be set.
+ * 2) The ring buffer must be enabled for recording.
+ * 3) The per cpu buffer must be enabled for recording.
+ *
+ * In case of an anomaly, this global flag has a bit set that
+ * will permantly disable all ring buffers.
+ */
+
+/*
+ * Global flag to disable all recording to ring buffers
+ *  This has two bits: ON, DISABLED
+ *
+ *  ON   DISABLED
+ * ---- ----------
+ *   0      0        : ring buffers are off
+ *   1      0        : ring buffers are on
+ *   X      1        : ring buffers are permanently disabled
+ */
+
+enum {
+	RB_BUFFERS_ON_BIT	= 0,
+	RB_BUFFERS_DISABLED_BIT	= 1,
+};
+
+enum {
+	RB_BUFFERS_ON		= 1 << RB_BUFFERS_ON_BIT,
+	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT,
+};
+
+static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
 /**
  * tracing_on - enable all tracing buffers
@@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;
  */
 void tracing_on(void)
 {
-	ring_buffers_off = 0;
+	set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
 }
 
 /**
@@ -42,7 +80,18 @@ void tracing_on(void)
  */
 void tracing_off(void)
 {
-	ring_buffers_off = 1;
+	clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
+}
+
+/**
+ * tracing_off_permanent - permanently disable ring buffers
+ *
+ * This function, once called, will disable all ring buffers
+ * permanenty.
+ */
+void tracing_off_permanent(void)
+{
+	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
 }
 
 #include "trace.h"
@@ -1185,7 +1234,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	int cpu, resched;
 
-	if (ring_buffers_off)
+	if (ring_buffer_flags != RB_BUFFERS_ON)
 		return NULL;
 
 	if (atomic_read(&buffer->record_disabled))
@@ -1297,7 +1346,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	int ret = -EBUSY;
 	int cpu, resched;
 
-	if (ring_buffers_off)
+	if (ring_buffer_flags != RB_BUFFERS_ON)
 		return -EBUSY;
 
 	if (atomic_read(&buffer->record_disabled))
@@ -2178,12 +2227,14 @@ static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
 {
-	int *p = filp->private_data;
+	long *p = filp->private_data;
 	char buf[64];
 	int r;
 
-	/* !ring_buffers_off == tracing_on */
-	r = sprintf(buf, "%d\n", !*p);
+	if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
+		r = sprintf(buf, "permanently disabled\n");
+	else
+		r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
 
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
@@ -2192,7 +2243,7 @@ static ssize_t
 rb_simple_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
-	int *p = filp->private_data;
+	long *p = filp->private_data;
 	char buf[64];
 	long val;
 	int ret;
@@ -2209,8 +2260,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	/* !ring_buffers_off == tracing_on */
-	*p = !val;
+	if (val)
+		set_bit(RB_BUFFERS_ON_BIT, p);
+	else
+		clear_bit(RB_BUFFERS_ON_BIT, p);
 
 	(*ppos)++;
 
@@ -2232,7 +2285,7 @@ static __init int rb_init_debugfs(void)
 	d_tracer = tracing_init_dentry();
 
 	entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-				    &ring_buffers_off, &rb_simple_fops);
+				    &ring_buffer_flags, &rb_simple_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_on' entry\n");
 
-- 
cgit v1.2.3-70-g09d2


From 69bb54ec05f57da7f6fac2cec0820cbc970df20f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 21 Nov 2008 12:59:38 -0500
Subject: ftrace: add ftrace_off_permanent

Impact: add new API to disable all of ftrace on anomalies

It case of a serious anomaly being detected (like something caught by
lockdep) it is a good idea to disable all tracing immediately, without
grabing any locks.

This patch adds ftrace_off_permanent that disables the tracers, function
tracing and ring buffers without a way to enable them again. This should
only be used when something serious has been detected.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  2 ++
 kernel/trace/trace.c   | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f7ba4ea5e12..13e9cfc0992 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -257,6 +257,7 @@ extern int ftrace_dump_on_oops;
 
 extern void tracing_start(void);
 extern void tracing_stop(void);
+extern void ftrace_off_permanent(void);
 
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
@@ -290,6 +291,7 @@ ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
 
 static inline void tracing_start(void) { }
 static inline void tracing_stop(void) { }
+static inline void ftrace_off_permanent(void) { }
 static inline int
 ftrace_printk(const char *fmt, ...)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4ee6f037522..0dbfb23ced9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -660,6 +660,21 @@ static void trace_init_cmdlines(void)
 static int trace_stop_count;
 static DEFINE_SPINLOCK(tracing_start_lock);
 
+/**
+ * ftrace_off_permanent - disable all ftrace code permanently
+ *
+ * This should only be called when a serious anomally has
+ * been detected.  This will turn off the function tracing,
+ * ring buffers, and other tracing utilites. It takes no
+ * locks and can be called from any context.
+ */
+void ftrace_off_permanent(void)
+{
+	tracing_disabled = 1;
+	ftrace_stop();
+	tracing_off_permanent();
+}
+
 /**
  * tracing_start - quick start of the tracer
  *
-- 
cgit v1.2.3-70-g09d2


From 8d7c6a96164651dbbab449ef0b5c20ae1f76a3a1 Mon Sep 17 00:00:00 2001
From: Török Edwin <edwintorok@gmail.com>
Date: Sun, 23 Nov 2008 12:39:06 +0200
Subject: tracing/stack-tracer: fix style issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/stacktrace.c | 51 +++++++++++++++++++++++++-------------------
 include/linux/stacktrace.h   |  2 +-
 kernel/trace/trace.c         |  7 +++---
 3 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b1515306041..10786af9554 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
 struct stack_frame {
 	const void __user	*next_fp;
-	unsigned long		return_address;
+	unsigned long		ret_addr;
 };
 
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
@@ -108,33 +108,40 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 	return ret;
 }
 
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+	const struct pt_regs *regs = task_pt_regs(current);
+	const void __user *fp = (const void __user *)regs->bp;
+
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = regs->ip;
+
+	while (trace->nr_entries < trace->max_entries) {
+		struct stack_frame frame;
+
+		frame.next_fp = NULL;
+		frame.ret_addr = 0;
+		if (!copy_stack_frame(fp, &frame))
+			break;
+		if ((unsigned long)fp < regs->sp)
+			break;
+		if (frame.ret_addr) {
+			trace->entries[trace->nr_entries++] =
+				frame.ret_addr;
+		}
+		if (fp == frame.next_fp)
+			break;
+		fp = frame.next_fp;
+	}
+}
+
 void save_stack_trace_user(struct stack_trace *trace)
 {
 	/*
 	 * Trace user stack if we are not a kernel thread
 	 */
 	if (current->mm) {
-		const struct pt_regs *regs = task_pt_regs(current);
-		const void __user *fp = (const void __user *)regs->bp;
-
-		if (trace->nr_entries < trace->max_entries)
-			trace->entries[trace->nr_entries++] = regs->ip;
-
-		while (trace->nr_entries < trace->max_entries) {
-			struct stack_frame frame;
-			frame.next_fp = NULL;
-			frame.return_address = 0;
-			if (!copy_stack_frame(fp, &frame))
-				break;
-			if ((unsigned long)fp < regs->sp)
-				break;
-			if (frame.return_address)
-				trace->entries[trace->nr_entries++] =
-					frame.return_address;
-			if (fp == frame.next_fp)
-				break;
-			fp = frame.next_fp;
-		}
+		__save_stack_trace_user(trace);
 	}
 	if (trace->nr_entries < trace->max_entries)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 68de51468f5..fd42d685110 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -25,7 +25,7 @@ extern void save_stack_trace_user(struct stack_trace *trace);
 #else
 # define save_stack_trace(trace)			do { } while (0)
 # define save_stack_trace_tsk(tsk, trace)		do { } while (0)
-# define save_stack_trace_user(trace)              do { } while (0)
+# define save_stack_trace_user(trace)			do { } while (0)
 # define print_stack_trace(trace, spaces)		do { } while (0)
 #endif
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62776b71b1c..dedf35f3697 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -948,9 +948,9 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags, int pc)
 {
+	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
-	struct ring_buffer_event *event;
 	unsigned long irq_flags;
 
 	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
@@ -1471,8 +1471,7 @@ static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 	if (file) {
 		ret = trace_seq_path(s, &file->f_path);
 		if (ret)
-			ret = trace_seq_printf(s, "[+0x%lx]",
-					ip - vmstart);
+			ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
 	}
 	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
 		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
@@ -1485,7 +1484,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 {
 	struct mm_struct *mm = NULL;
 	int ret = 1;
-	unsigned i;
+	unsigned int i;
 
 	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
 		struct task_struct *task;
-- 
cgit v1.2.3-70-g09d2


From 8d26487fd4ddda7a0237da418fb8669fb06ae557 Mon Sep 17 00:00:00 2001
From: Török Edwin <edwintorok@gmail.com>
Date: Sun, 23 Nov 2008 12:39:08 +0200
Subject: tracing/stack-tracer: introduce CONFIG_USER_STACKTRACE_SUPPORT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

User stack tracing is just implemented for x86, but it is not x86 specific.

Introduce a generic config flag, that is currently enabled only for x86.
When other arches implement it, they will have to
SELECT USER_STACKTRACE_SUPPORT.

Signed-off-by: Török Edwin <edwintorok@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig           | 1 +
 include/linux/stacktrace.h | 2 +-
 kernel/trace/Kconfig       | 3 +++
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7a146baaa99..e49a4fd718f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -36,6 +36,7 @@ config X86
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select USER_STACKTRACE_SUPPORT
 
 config ARCH_DEFCONFIG
 	string
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index fd42d685110..1a8cecc4f38 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -16,7 +16,7 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 
-#ifdef CONFIG_X86
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
 extern void save_stack_trace_user(struct stack_trace *trace);
 #else
 # define save_stack_trace_user(trace)              do { } while (0)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b8378fad29a..87fc34a1bb9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,6 +3,9 @@
 #  select HAVE_FUNCTION_TRACER:
 #
 
+config USER_STACKTRACE_SUPPORT
+	bool
+
 config NOP_TRACER
 	bool
 
-- 
cgit v1.2.3-70-g09d2


From e262a7ba31f0cd4dae225e6d2e9037e5ac6108e8 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Sun, 23 Nov 2008 14:34:43 +0000
Subject: irq.h: remove padding from irq_desc on 64bits

Impact: reduce struct irq_desc size

struct irq_desc: reorder to remove padding on 64bits

shrinks irq_desc to 128 bytes which saves data space & cache lines

On a generic x86_64/SMP build this reduces the reported data size by
64k.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d058c57be02..838a977885e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -142,8 +142,8 @@ struct irq_chip {
  * @depth:		disable-depth, for nested irq_disable() calls
  * @wake_depth:		enable depth, for multiple set_irq_wake() callers
  * @irq_count:		stats field to detect stalled irqs
- * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @last_unhandled:	aging timer for unhandled count
+ * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @lock:		locking for SMP
  * @affinity:		IRQ affinity on SMP
  * @cpu:		cpu index useful for balancing
@@ -165,8 +165,8 @@ struct irq_desc {
 	unsigned int		depth;		/* nested irq disables */
 	unsigned int		wake_depth;	/* nested wake enables */
 	unsigned int		irq_count;	/* For detecting broken IRQs */
-	unsigned int		irqs_unhandled;
 	unsigned long		last_unhandled;	/* Aging timer for unhandled count */
+	unsigned int		irqs_unhandled;
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
 	cpumask_t		affinity;
-- 
cgit v1.2.3-70-g09d2


From b20a9c24d5c5d466d7e4a25c6f1bedbd2d16ad4f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 23 Nov 2008 16:02:31 -0800
Subject: dccp: Set per-connection CCIDs via socket options

With this patch, TX/RX CCIDs can now be changed on a per-connection
basis, which overrides the defaults set by the global sysctl variables
for TX/RX CCIDs.

To make full use of this facility, the remaining patches of this patch
set are needed, which track dependencies and activate negotiated
feature values.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dccp.txt | 14 ++++++++++++++
 include/linux/dccp.h              |  5 +++++
 net/dccp/ackvec.c                 |  9 ++++-----
 net/dccp/ackvec.h                 |  5 ++---
 net/dccp/feat.h                   |  2 ++
 net/dccp/proto.c                  | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 61 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 43df4487379..610083ff73f 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -61,6 +61,20 @@ DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs
 supported by the endpoint (see include/linux/dccp.h for symbolic constants).
 The caller needs to provide a sufficiently large (> 2) array of type uint8_t.
 
+DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same
+time, combining the operation of the next two socket options. This option is
+preferrable over the latter two, since often applications will use the same
+type of CCID for both directions; and mixed use of CCIDs is not currently well
+understood. This socket option takes as argument at least one uint8_t value, or
+an array of uint8_t values, which must match available CCIDS (see above). CCIDs
+must be registered on the socket before calling connect() or listen().
+
+DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets
+the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID.
+Please note that the getsockopt argument type here is `int', not uint8_t.
+
+DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID.
+
 DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold
 timewait state when closing the connection (RFC 4340, 8.3). The usual case is
 that the closing server sends a CloseReq, whereupon the client holds timewait
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index eda389ce04f..6a72ff52a8a 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -168,6 +168,8 @@ enum {
 	DCCPO_MIN_CCID_SPECIFIC = 128,
 	DCCPO_MAX_CCID_SPECIFIC = 255,
 };
+/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */
+#define DCCP_SINGLE_OPT_MAXLEN	253
 
 /* DCCP CCIDS */
 enum {
@@ -203,6 +205,9 @@ enum dccp_feature_numbers {
 #define DCCP_SOCKOPT_SEND_CSCOV		10
 #define DCCP_SOCKOPT_RECV_CSCOV		11
 #define DCCP_SOCKOPT_AVAILABLE_CCIDS	12
+#define DCCP_SOCKOPT_CCID		13
+#define DCCP_SOCKOPT_TX_CCID		14
+#define DCCP_SOCKOPT_RX_CCID		15
 #define DCCP_SOCKOPT_CCID_RX_INFO	128
 #define DCCP_SOCKOPT_CCID_TX_INFO	192
 
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 1e8be246ad1..01e4d39fa23 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -12,7 +12,6 @@
 #include "ackvec.h"
 #include "dccp.h"
 
-#include <linux/dccp.h>
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
@@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	struct dccp_sock *dp = dccp_sk(sk);
 	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
 	/* Figure out how many options do we need to represent the ackvec */
-	const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN);
+	const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN);
 	u16 len = av->av_vec_len + 2 * nr_opts, i;
 	u32 elapsed_time;
 	const unsigned char *tail, *from;
@@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 	for (i = 0; i < nr_opts; ++i) {
 		int copylen = len;
 
-		if (len > DCCP_MAX_ACKVEC_OPT_LEN)
-			copylen = DCCP_MAX_ACKVEC_OPT_LEN;
+		if (len > DCCP_SINGLE_OPT_MAXLEN)
+			copylen = DCCP_SINGLE_OPT_MAXLEN;
 
 		*to++ = DCCPO_ACK_VECTOR_0;
 		*to++ = copylen + 2;
@@ -432,7 +431,7 @@ found:
 int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
 		      u64 *ackno, const u8 opt, const u8 *value, const u8 len)
 {
-	if (len > DCCP_MAX_ACKVEC_OPT_LEN)
+	if (len > DCCP_SINGLE_OPT_MAXLEN)
 		return -1;
 
 	/* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index bcb64fb4ace..4ccee030524 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -11,15 +11,14 @@
  *	published by the Free Software Foundation.
  */
 
+#include <linux/dccp.h>
 #include <linux/compiler.h>
 #include <linux/ktime.h>
 #include <linux/list.h>
 #include <linux/types.h>
 
-/* Read about the ECN nonce to see why it is 253 */
-#define DCCP_MAX_ACKVEC_OPT_LEN 253
 /* We can spread an ack vector across multiple options */
-#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2)
+#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
 
 #define DCCP_ACKVEC_STATE_RECEIVED	0
 #define DCCP_ACKVEC_STATE_ECN_MARKED	(1 << 6)
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
index 4d172822df1..093af1610d1 100644
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -22,6 +22,8 @@
 /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
 #define DCCPF_SEQ_WMIN		32
 #define DCCPF_SEQ_WMAX		0x3FFFFFFFFFFFull
+/* Maximum number of SP values that fit in a single (Confirm) option */
+#define DCCP_FEAT_MAX_SP_VALS	(DCCP_SINGLE_OPT_MAXLEN - 2)
 
 enum dccp_feat_type {
 	FEAT_AT_RX   = 1,	/* located at RX side of half-connection  */
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 8b63394ec24..445884cf1c2 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -501,6 +501,36 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
 	return rc;
 }
 
+static int dccp_setsockopt_ccid(struct sock *sk, int type,
+				char __user *optval, int optlen)
+{
+	u8 *val;
+	int rc = 0;
+
+	if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+		return -EINVAL;
+
+	val = kmalloc(optlen, GFP_KERNEL);
+	if (val == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(val, optval, optlen)) {
+		kfree(val);
+		return -EFAULT;
+	}
+
+	lock_sock(sk);
+	if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
+		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
+
+	if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
+		rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
+	release_sock(sk);
+
+	kfree(val);
+	return rc;
+}
+
 static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 		char __user *optval, int optlen)
 {
@@ -515,6 +545,10 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
 	case DCCP_SOCKOPT_CHANGE_R:
 		DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
 		return 0;
+	case DCCP_SOCKOPT_CCID:
+	case DCCP_SOCKOPT_RX_CCID:
+	case DCCP_SOCKOPT_TX_CCID:
+		return dccp_setsockopt_ccid(sk, optname, optval, optlen);
 	}
 
 	if (optlen < (int)sizeof(int))
-- 
cgit v1.2.3-70-g09d2


From 1f87e235e6fb92c2968b52b9191de04f1aff8e77 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sun, 23 Nov 2008 23:24:32 -0800
Subject: eth: Declare an optimized compare_ether_addr_64bits() function

Linus mentioned we could try to perform long word operations, even
on potentially unaligned addresses, on x86 at least. David mentioned
the HAVE_EFFICIENT_UNALIGNED_ACCESS test to handle this on all
arches that have efficient unailgned accesses.

I tried this idea and got nice assembly on 32 bits:

158:   33 82 38 01 00 00       xor    0x138(%edx),%eax
15e:   33 8a 34 01 00 00       xor    0x134(%edx),%ecx
164:   c1 e0 10                shl    $0x10,%eax
167:   09 c1                   or     %eax,%ecx
169:   74 0b                   je     176 <eth_type_trans+0x87>

And very nice assembly on 64 bits of course (one xor, one shl)

Nice oprofile improvement in eth_type_trans(), 0.17 % instead of 0.41 %,
expected since we remove 8 instructions on a fast path.

This patch implements a compare_ether_addr_64bits() function, that
uses the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS ifdef to efficiently
perform the 6 bytes comparison on all capable arches.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 net/ethernet/eth.c          |  6 +++---
 2 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 0e5e9706003..1cb0f0b9092 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -27,6 +27,7 @@
 #include <linux/if_ether.h>
 #include <linux/netdevice.h>
 #include <linux/random.h>
+#include <asm/unaligned.h>
 
 #ifdef __KERNEL__
 extern __be16		eth_type_trans(struct sk_buff *skb, struct net_device *dev);
@@ -140,6 +141,47 @@ static inline unsigned compare_ether_addr(const u8 *addr1, const u8 *addr2)
 	BUILD_BUG_ON(ETH_ALEN != 6);
 	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) != 0;
 }
+
+static inline unsigned long zap_last_2bytes(unsigned long value)
+{
+#ifdef __BIG_ENDIAN
+	return value >> 16;
+#else
+	return value << 16;
+#endif
+}
+
+/**
+ * compare_ether_addr_64bits - Compare two Ethernet addresses
+ * @addr1: Pointer to an array of 8 bytes
+ * @addr2: Pointer to an other array of 8 bytes
+ *
+ * Compare two ethernet addresses, returns 0 if equal.
+ * Same result than "memcmp(addr1, addr2, ETH_ALEN)" but without conditional
+ * branches, and possibly long word memory accesses on CPU allowing cheap
+ * unaligned memory reads.
+ * arrays = { byte1, byte2, byte3, byte4, byte6, byte7, pad1, pad2}
+ *
+ * Please note that alignment of addr1 & addr2 is only guaranted to be 16 bits.
+ */
+
+static inline unsigned compare_ether_addr_64bits(const u8 addr1[6+2],
+						 const u8 addr2[6+2])
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	unsigned long fold = ((*(unsigned long *)addr1) ^
+			      (*(unsigned long *)addr2));
+
+	if (sizeof(fold) == 8)
+		return zap_last_2bytes(fold) != 0;
+
+	fold |= zap_last_2bytes((*(unsigned long *)(addr1 + 4)) ^
+				(*(unsigned long *)(addr2 + 4)));
+	return fold != 0;
+#else
+	return compare_ether_addr(addr1, addr2);
+#endif
+}
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_ETHERDEVICE_H */
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index a87a171d991..280352aba40 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -165,8 +165,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	skb_pull(skb, ETH_HLEN);
 	eth = eth_hdr(skb);
 
-	if (is_multicast_ether_addr(eth->h_dest)) {
-		if (!compare_ether_addr(eth->h_dest, dev->broadcast))
+	if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
+		if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast))
 			skb->pkt_type = PACKET_BROADCAST;
 		else
 			skb->pkt_type = PACKET_MULTICAST;
@@ -181,7 +181,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	 */
 
 	else if (1 /*dev->flags&IFF_PROMISC */ ) {
-		if (unlikely(compare_ether_addr(eth->h_dest, dev->dev_addr)))
+		if (unlikely(compare_ether_addr_64bits(eth->h_dest, dev->dev_addr)))
 			skb->pkt_type = PACKET_OTHERHOST;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 758b2cdc6f6a22c702bd8f2344382fb1270b2161 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:04 +1030
Subject: sched: wrap sched_group and sched_domain cpumask accesses.

Impact: trivial wrap of member accesses

This eases the transition in the next patch.

We also get rid of a temporary cpumask in find_idlest_cpu() thanks to
for_each_cpu_and, and sched_balance_self() due to getting weight before
setting sd to NULL.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  10 +++++
 kernel/sched.c        | 114 ++++++++++++++++++++++++--------------------------
 kernel/sched_fair.c   |  10 ++---
 kernel/sched_rt.c     |   3 +-
 kernel/sched_stats.h  |   3 +-
 5 files changed, 73 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ce5c603c51..2b95aa9f779 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -786,6 +786,11 @@ struct sched_group {
 	u32 reciprocal_cpu_power;
 };
 
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+	return &sg->cpumask;
+}
+
 enum sched_domain_level {
 	SD_LV_NONE = 0,
 	SD_LV_SIBLING,
@@ -866,6 +871,11 @@ struct sched_domain {
 #endif
 };
 
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+	return &sd->span;
+}
+
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 				    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
diff --git a/kernel/sched.c b/kernel/sched.c
index a2de33d0534..575f38acf4d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1501,7 +1501,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	struct sched_domain *sd = data;
 	int i;
 
-	for_each_cpu_mask(i, sd->span) {
+	for_each_cpu(i, sched_domain_span(sd)) {
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
@@ -1522,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
-	for_each_cpu_mask(i, sd->span)
+	for_each_cpu(i, sched_domain_span(sd))
 		update_group_shares_cpu(tg, i, shares, rq_weight);
 
 	return 0;
@@ -2053,15 +2053,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		int i;
 
 		/* Skip over this group if it has no CPUs allowed */
-		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+		if (!cpumask_intersects(sched_group_cpus(group),
+					&p->cpus_allowed))
 			continue;
 
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
 
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 
-		for_each_cpu(i, &group->cpumask) {
+		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -2093,17 +2095,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-		cpumask_t *tmp)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	int idlest = -1;
 	int i;
 
 	/* Traverse only the allowed CPUs */
-	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-
-	for_each_cpu(i, tmp) {
+	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2145,7 +2144,6 @@ static int sched_balance_self(int cpu, int flag)
 		update_shares(sd);
 
 	while (sd) {
-		cpumask_t span, tmpmask;
 		struct sched_group *group;
 		int new_cpu, weight;
 
@@ -2154,14 +2152,13 @@ static int sched_balance_self(int cpu, int flag)
 			continue;
 		}
 
-		span = sd->span;
 		group = find_idlest_group(sd, t, cpu);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+		new_cpu = find_idlest_cpu(group, t, cpu);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
@@ -2170,10 +2167,10 @@ static int sched_balance_self(int cpu, int flag)
 
 		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
+		weight = cpumask_weight(sched_domain_span(sd));
 		sd = NULL;
-		weight = cpus_weight(span);
 		for_each_domain(cpu, tmp) {
-			if (weight <= cpus_weight(tmp->span))
+			if (weight <= cpumask_weight(sched_domain_span(tmp)))
 				break;
 			if (tmp->flags & flag)
 				sd = tmp;
@@ -2218,7 +2215,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		cpu = task_cpu(p);
 
 		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 				update_shares(sd);
 				break;
 			}
@@ -2266,7 +2263,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	else {
 		struct sched_domain *sd;
 		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 				schedstat_inc(sd, ttwu_wake_remote);
 				break;
 			}
@@ -3109,10 +3106,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		unsigned long sum_avg_load_per_task;
 		unsigned long avg_load_per_task;
 
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
 
 		if (local_group)
-			balance_cpu = first_cpu(group->cpumask);
+			balance_cpu = cpumask_first(sched_group_cpus(group));
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3121,13 +3119,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu(i, &group->cpumask) {
-			struct rq *rq;
-
-			if (!cpu_isset(i, *cpus))
-				continue;
-
-			rq = cpu_rq(i);
+		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+			struct rq *rq = cpu_rq(i);
 
 			if (*sd_idle && rq->nr_running)
 				*sd_idle = 0;
@@ -3238,8 +3231,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 */
 		if ((sum_nr_running < min_nr_running) ||
 		    (sum_nr_running == min_nr_running &&
-		     first_cpu(group->cpumask) <
-		     first_cpu(group_min->cpumask))) {
+		     cpumask_first(sched_group_cpus(group)) <
+		     cpumask_first(sched_group_cpus(group_min)))) {
 			group_min = group;
 			min_nr_running = sum_nr_running;
 			min_load_per_task = sum_weighted_load /
@@ -3254,8 +3247,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sum_nr_running <= group_capacity - 1) {
 			if (sum_nr_running > leader_nr_running ||
 			    (sum_nr_running == leader_nr_running &&
-			     first_cpu(group->cpumask) >
-			      first_cpu(group_leader->cpumask))) {
+			     cpumask_first(sched_group_cpus(group)) >
+			     cpumask_first(sched_group_cpus(group_leader)))) {
 				group_leader = group;
 				leader_nr_running = sum_nr_running;
 			}
@@ -3400,7 +3393,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu(i, &group->cpumask) {
+	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -3746,7 +3739,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 	/* Search for an sd spanning us and the target CPU. */
 	for_each_domain(target_cpu, sd) {
 		if ((sd->flags & SD_LOAD_BALANCE) &&
-		    cpu_isset(busiest_cpu, sd->span))
+		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
 				break;
 	}
 
@@ -6618,7 +6611,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 	struct sched_group *group = sd->groups;
 	char str[256];
 
-	cpulist_scnprintf(str, sizeof(str), sd->span);
+	cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
 	cpus_clear(*groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -6633,11 +6626,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
-	if (!cpu_isset(cpu, sd->span)) {
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
 				"CPU%d\n", cpu);
 	}
-	if (!cpu_isset(cpu, group->cpumask)) {
+	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain"
 				" CPU%d\n", cpu);
 	}
@@ -6657,31 +6650,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!cpus_weight(group->cpumask)) {
+		if (!cpumask_weight(sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: empty group\n");
 			break;
 		}
 
-		if (cpus_intersects(*groupmask, group->cpumask)) {
+		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
 		}
 
-		cpus_or(*groupmask, *groupmask, group->cpumask);
+		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-		cpulist_scnprintf(str, sizeof(str), group->cpumask);
+		cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
 		printk(KERN_CONT " %s", str);
 
 		group = group->next;
 	} while (group != sd->groups);
 	printk(KERN_CONT "\n");
 
-	if (!cpus_equal(sd->span, *groupmask))
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
 		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 
-	if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+	if (sd->parent &&
+	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
 		printk(KERN_ERR "ERROR: parent span is not a superset "
 			"of domain->span\n");
 	return 0;
@@ -6721,7 +6715,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
 static int sd_degenerate(struct sched_domain *sd)
 {
-	if (cpus_weight(sd->span) == 1)
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 
 	/* Following flags need at least 2 groups */
@@ -6752,7 +6746,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	if (sd_degenerate(parent))
 		return 1;
 
-	if (!cpus_equal(sd->span, parent->span))
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 
 	/* Does parent contain flags not in child? */
@@ -6913,10 +6907,10 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 		int group = group_fn(i, cpu_map, &sg, tmpmask);
 		int j;
 
-		if (cpu_isset(i, *covered))
+		if (cpumask_test_cpu(i, covered))
 			continue;
 
-		cpus_clear(sg->cpumask);
+		cpumask_clear(sched_group_cpus(sg));
 		sg->__cpu_power = 0;
 
 		for_each_cpu(j, span) {
@@ -6924,7 +6918,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 				continue;
 
 			cpu_set(j, *covered);
-			cpu_set(j, sg->cpumask);
+			cpumask_set_cpu(j, sched_group_cpus(sg));
 		}
 		if (!first)
 			first = sg;
@@ -7119,11 +7113,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 	if (!sg)
 		return;
 	do {
-		for_each_cpu(j, &sg->cpumask) {
+		for_each_cpu(j, sched_group_cpus(sg)) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j);
-			if (j != first_cpu(sd->groups->cpumask)) {
+			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -7200,7 +7194,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != first_cpu(sd->groups->cpumask))
+	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
 		return;
 
 	child = sd->child;
@@ -7372,7 +7366,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			sd = &per_cpu(allnodes_domains, i);
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
-			sd->span = *cpu_map;
+			cpumask_copy(sched_domain_span(sd), cpu_map);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7382,18 +7376,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(node_domains, i);
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
-		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
 		sd->parent = p;
 		if (p)
 			p->child = sd;
-		cpus_and(sd->span, sd->span, *cpu_map);
+		cpumask_and(sched_domain_span(sd),
+			    sched_domain_span(sd), cpu_map);
 #endif
 
 		p = sd;
 		sd = &per_cpu(phys_domains, i);
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
-		sd->span = *nodemask;
+		cpumask_copy(sched_domain_span(sd), nodemask);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7404,8 +7399,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(core_domains, i);
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
-		sd->span = cpu_coregroup_map(i);
-		cpus_and(sd->span, sd->span, *cpu_map);
+		*sched_domain_span(sd) = cpu_coregroup_map(i);
+		cpumask_and(sched_domain_span(sd),
+			    sched_domain_span(sd), cpu_map);
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7416,8 +7412,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sd = &per_cpu(cpu_domains, i);
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
-		sd->span = per_cpu(cpu_sibling_map, i);
-		cpus_and(sd->span, sd->span, *cpu_map);
+		cpumask_and(sched_domain_span(sd),
+			    &per_cpu(cpu_sibling_map, i), cpu_map);
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7503,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			sd->groups = sg;
 		}
 		sg->__cpu_power = 0;
-		sg->cpumask = *nodemask;
+		cpumask_copy(sched_group_cpus(sg), nodemask);
 		sg->next = sg;
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
@@ -7530,7 +7526,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 				goto error;
 			}
 			sg->__cpu_power = 0;
-			sg->cpumask = *tmpmask;
+			cpumask_copy(sched_group_cpus(sg), tmpmask);
 			sg->next = prev->next;
 			cpus_or(*covered, *covered, *tmpmask);
 			prev->next = sg;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b05..bba00402ed9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1024,7 +1024,6 @@ static void yield_task_fair(struct rq *rq)
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 static int wake_idle(int cpu, struct task_struct *p)
 {
-	cpumask_t tmp;
 	struct sched_domain *sd;
 	int i;
 
@@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p)
 		if ((sd->flags & SD_WAKE_IDLE)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
-			cpus_and(tmp, sd->span, p->cpus_allowed);
-			cpus_and(tmp, tmp, cpu_active_map);
-			for_each_cpu_mask_nr(i, tmp) {
-				if (idle_cpu(i)) {
+			for_each_cpu_and(i, sched_domain_span(sd),
+					 &p->cpus_allowed) {
+				if (cpu_active(i) && idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
 						       se.nr_wakeups_idle);
@@ -1240,7 +1238,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	 * this_cpu and prev_cpu are present in:
 	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(prev_cpu, sd->span)) {
+		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
 			this_sd = sd;
 			break;
 		}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2bdd4442359..4cd813abc23 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1017,7 +1017,8 @@ static int find_lowest_rq(struct task_struct *task)
 			cpumask_t domain_mask;
 			int       best_cpu;
 
-			cpus_and(domain_mask, sd->span, *lowest_mask);
+			cpumask_and(&domain_mask, sched_domain_span(sd),
+				    lowest_mask);
 
 			best_cpu = pick_optimal_cpu(this_cpu,
 						    &domain_mask);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02..ce340835d05 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
-			cpumask_scnprintf(mask_str, mask_len, sd->span);
+			cpumask_scnprintf(mask_str, mask_len,
+					  *sched_domain_span(sd));
 			seq_printf(seq, "domain%d %s", dcount++, mask_str);
 			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
 					itype++) {
-- 
cgit v1.2.3-70-g09d2


From 6c99e9ad47d9c082bd096f42fb49e397b05d58a8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:04 +1030
Subject: sched: convert struct sched_group/sched_domain cpumask_ts to variable
 bitmaps

Impact: (future) size reduction for large NR_CPUS.

We move the 'cpumask' member of sched_group to the end, so when we
kmalloc it we can do a minimal allocation: saves space for small
nr_cpu_ids but big CONFIG_NR_CPUS.  Similar trick for 'span' in
sched_domain.

This isn't quite as good as converting to a cpumask_var_t, as some
sched_groups are actually static, but it's safer: we don't have to
figure out where to call alloc_cpumask_var/free_cpumask_var.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 11 +++++----
 kernel/sched.c        | 65 ++++++++++++++++++++++++++++++++-------------------
 2 files changed, 48 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b95aa9f779..c5be6c6bc74 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -771,7 +771,6 @@ enum cpu_idle_type {
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
-	cpumask_t cpumask;
 
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -784,11 +783,13 @@ struct sched_group {
 	 * (see include/linux/reciprocal_div.h)
 	 */
 	u32 reciprocal_cpu_power;
+
+	unsigned long cpumask[];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
 {
-	return &sg->cpumask;
+	return to_cpumask(sg->cpumask);
 }
 
 enum sched_domain_level {
@@ -814,7 +815,6 @@ struct sched_domain {
 	struct sched_domain *parent;	/* top domain must be null terminated */
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
-	cpumask_t span;			/* span of all CPUs in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
@@ -869,11 +869,14 @@ struct sched_domain {
 #ifdef CONFIG_SCHED_DEBUG
 	char *name;
 #endif
+
+	/* span of all CPUs in this domain */
+	unsigned long span[];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 {
-	return &sd->span;
+	return to_cpumask(sd->span);
 }
 
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
diff --git a/kernel/sched.c b/kernel/sched.c
index 575f38acf4d..6b9606a6cab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7005,19 +7005,34 @@ static void sched_domain_node_span(int node, cpumask_t *span)
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
+/*
+ * The cpus mask in sched_group and sched_domain hangs off the end.
+ * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ * for nr_cpu_ids < CONFIG_NR_CPUS.
+ */
+struct static_sched_group {
+	struct sched_group sg;
+	DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+};
+
+struct static_sched_domain {
+	struct sched_domain sd;
+	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+};
+
 /*
  * SMT sched-domains:
  */
 #ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
 
 static int
 cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 		 cpumask_t *unused)
 {
 	if (sg)
-		*sg = &per_cpu(sched_group_cpus, cpu);
+		*sg = &per_cpu(sched_group_cpus, cpu).sg;
 	return cpu;
 }
 #endif /* CONFIG_SCHED_SMT */
@@ -7026,8 +7041,8 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
  * multi-core sched-domains:
  */
 #ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
 #endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -7041,7 +7056,7 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 	cpus_and(*mask, *mask, *cpu_map);
 	group = first_cpu(*mask);
 	if (sg)
-		*sg = &per_cpu(sched_group_core, group);
+		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
 #elif defined(CONFIG_SCHED_MC)
@@ -7050,13 +7065,13 @@ cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 		  cpumask_t *unused)
 {
 	if (sg)
-		*sg = &per_cpu(sched_group_core, cpu);
+		*sg = &per_cpu(sched_group_core, cpu).sg;
 	return cpu;
 }
 #endif
 
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
 
 static int
 cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
@@ -7075,7 +7090,7 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 	group = cpu;
 #endif
 	if (sg)
-		*sg = &per_cpu(sched_group_phys, group);
+		*sg = &per_cpu(sched_group_phys, group).sg;
 	return group;
 }
 
@@ -7089,7 +7104,7 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
 				 struct sched_group **sg, cpumask_t *nodemask)
@@ -7101,7 +7116,7 @@ static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
 	group = first_cpu(*nodemask);
 
 	if (sg)
-		*sg = &per_cpu(sched_group_allnodes, group);
+		*sg = &per_cpu(sched_group_allnodes, group).sg;
 	return group;
 }
 
@@ -7116,7 +7131,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		for_each_cpu(j, sched_group_cpus(sg)) {
 			struct sched_domain *sd;
 
-			sd = &per_cpu(phys_domains, j);
+			sd = &per_cpu(phys_domains, j).sd;
 			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
 				/*
 				 * Only add "power" once for each
@@ -7385,7 +7400,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 		p = sd;
-		sd = &per_cpu(phys_domains, i);
+		sd = &per_cpu(phys_domains, i).sd;
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		cpumask_copy(sched_domain_span(sd), nodemask);
@@ -7396,7 +7411,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
 		p = sd;
-		sd = &per_cpu(core_domains, i);
+		sd = &per_cpu(core_domains, i).sd;
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		*sched_domain_span(sd) = cpu_coregroup_map(i);
@@ -7409,7 +7424,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
-		sd = &per_cpu(cpu_domains, i);
+		sd = &per_cpu(cpu_domains, i).sd;
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		cpumask_and(sched_domain_span(sd),
@@ -7485,7 +7500,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sched_domain_node_span(i, domainspan);
 		cpus_and(*domainspan, *domainspan, *cpu_map);
 
-		sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				  GFP_KERNEL, i);
 		if (!sg) {
 			printk(KERN_WARNING "Can not alloc domain group for "
 				"node %d\n", i);
@@ -7518,7 +7534,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			if (cpus_empty(*tmpmask))
 				continue;
 
-			sg = kmalloc_node(sizeof(struct sched_group),
+			sg = kmalloc_node(sizeof(struct sched_group) +
+					  cpumask_size(),
 					  GFP_KERNEL, i);
 			if (!sg) {
 				printk(KERN_WARNING
@@ -7538,21 +7555,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(cpu_domains, i);
+		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(core_domains, i);
+		struct sched_domain *sd = &per_cpu(core_domains, i).sd;
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(phys_domains, i);
+		struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 
 		init_sched_groups_power(i, sd);
 	}
@@ -7574,11 +7591,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
+		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
-		sd = &per_cpu(core_domains, i);
+		sd = &per_cpu(core_domains, i).sd;
 #else
-		sd = &per_cpu(phys_domains, i);
+		sd = &per_cpu(phys_domains, i).sd;
 #endif
 		cpu_attach_domain(sd, rd, i);
 	}
-- 
cgit v1.2.3-70-g09d2


From 6a7b3dc3440f7b5a9b67594af01ed562cdeb41e4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:04 +1030
Subject: sched: convert nohz_cpu_mask to cpumask_var_t.

Impact: (future) size reduction for large NR_CPUS.

Dynamically allocating cpumasks (when CONFIG_CPUMASK_OFFSTACK) saves
space for small nr_cpu_ids but big CONFIG_NR_CPUS.  cpumask_var_t
is just a struct cpumask for !CONFIG_CPUMASK_OFFSTACK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    |  2 +-
 kernel/rcuclassic.c      |  2 +-
 kernel/sched.c           |  7 +++++--
 kernel/time/tick-sched.c | 10 +++++-----
 4 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c5be6c6bc74..1e33e2cb7f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,7 +249,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 extern int runqueue_is_locked(void);
 extern void task_rq_unlock_wait(struct task_struct *p);
 
-extern cpumask_t nohz_cpu_mask;
+extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 #else
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f33..c03ca3e6191 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		 * unnecessarily.
 		 */
 		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+		cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
 
 		rcp->signaled = 0;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 6b9606a6cab..2723d7a4a42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5870,9 +5870,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  * indicates which cpus entered this state. This is used
  * in the rcu update to wait only for active cpus. For system
  * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_MASK_NONE.
+ * always be CPU_BITS_NONE.
  */
-cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+cpumask_var_t nohz_cpu_mask;
 
 /*
  * Increase the granularity value when there are more CPUs,
@@ -8274,6 +8274,9 @@ void __init sched_init(void)
 	 */
 	current->sched_class = &fair_sched_class;
 
+	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+
 	scheduler_running = 1;
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab4..70f872c71f4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
 	if (!ts->tick_stopped)
 		return;
 
-	cpu_clear(cpu, nohz_cpu_mask);
+	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	now = ktime_get();
 	ts->idle_waketime = now;
 
@@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if ((long)delta_jiffies >= 1) {
 
 		if (delta_jiffies > 1)
-			cpu_set(cpu, nohz_cpu_mask);
+			cpumask_set_cpu(cpu, nohz_cpu_mask);
 		/*
 		 * nohz_stop_sched_tick can be called several times before
 		 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 				/*
 				 * sched tick not stopped!
 				 */
-				cpu_clear(cpu, nohz_cpu_mask);
+				cpumask_clear_cpu(cpu, nohz_cpu_mask);
 				goto out;
 			}
 
@@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * softirq.
 		 */
 		tick_do_update_jiffies64(ktime_get());
-		cpu_clear(cpu, nohz_cpu_mask);
+		cpumask_clear_cpu(cpu, nohz_cpu_mask);
 	}
 	raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void)
 	select_nohz_load_balancer(0);
 	now = ktime_get();
 	tick_do_update_jiffies64(now);
-	cpu_clear(cpu, nohz_cpu_mask);
+	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
 	/*
 	 * We stopped the tick in idle. Update process times would miss the
-- 
cgit v1.2.3-70-g09d2


From 96f874e26428ab5d2db681c100210c254775e154 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 25 Nov 2008 02:35:14 +1030
Subject: sched: convert remaining old-style cpumask operators

Impact: Trivial API conversion

  NR_CPUS -> nr_cpu_ids
  cpumask_t -> struct cpumask
  sizeof(cpumask_t) -> cpumask_size()
  cpumask_a = cpumask_b -> cpumask_copy(&cpumask_a, &cpumask_b)

  cpu_set() -> cpumask_set_cpu()
  first_cpu() -> cpumask_first()
  cpumask_of_cpu() -> cpumask_of()
  cpus_* -> cpumask_*

There are some FIXMEs where we all archs to complete infrastructure
(patches have been sent):

  cpu_coregroup_map -> cpu_coregroup_mask
  node_to_cpumask* -> cpumask_of_node

There is also one FIXME where we pass an array of cpumasks to
partition_sched_domains(): this implies knowing the definition of
'struct cpumask' and the size of a cpumask.  This will be fixed in a
future patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  16 ++--
 kernel/sched.c        | 212 +++++++++++++++++++++++++++-----------------------
 kernel/sched_fair.c   |   4 +-
 kernel/sched_rt.c     |  18 ++---
 4 files changed, 132 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e33e2cb7f8..4b7b0187374 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -879,7 +879,7 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 				    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
@@ -888,7 +888,7 @@ extern int arch_reinit_sched_domains(void);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 			struct sched_domain_attr *dattr_new)
 {
 }
@@ -970,7 +970,7 @@ struct sched_class {
 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-				 const cpumask_t *newmask);
+				 const struct cpumask *newmask);
 
 	void (*rq_online)(struct rq *rq);
 	void (*rq_offline)(struct rq *rq);
@@ -1612,12 +1612,12 @@ extern cputime_t task_gtime(struct task_struct *p);
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed_ptr(struct task_struct *p,
-				const cpumask_t *new_mask);
+				const struct cpumask *new_mask);
 #else
 static inline int set_cpus_allowed_ptr(struct task_struct *p,
-				       const cpumask_t *new_mask)
+				       const struct cpumask *new_mask)
 {
-	if (!cpu_isset(0, *new_mask))
+	if (!cpumask_test_cpu(0, new_mask))
 		return -EINVAL;
 	return 0;
 }
@@ -2230,8 +2230,8 @@ __trace_special(void *__tr, void *__data,
 }
 #endif
 
-extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
-extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
 extern int sched_mc_power_savings, sched_smt_power_savings;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index f2be6187003..eba6a156d33 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2829,7 +2829,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	struct rq *rq;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpu_isset(dest_cpu, p->cpus_allowed)
+	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
 	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
@@ -2895,7 +2895,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
 	 * 3) are cache-hot on their current CPU.
 	 */
-	if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
 		schedstat_inc(p, se.nr_failed_migrations_affine);
 		return 0;
 	}
@@ -3070,7 +3070,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const cpumask_t *cpus, int *balance)
+		   int *sd_idle, const struct cpumask *cpus, int *balance)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3387,7 +3387,7 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-		   unsigned long imbalance, const cpumask_t *cpus)
+		   unsigned long imbalance, const struct cpumask *cpus)
 {
 	struct rq *busiest = NULL, *rq;
 	unsigned long max_load = 0;
@@ -3396,7 +3396,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	for_each_cpu(i, sched_group_cpus(group)) {
 		unsigned long wl;
 
-		if (!cpu_isset(i, *cpus))
+		if (!cpumask_test_cpu(i, cpus))
 			continue;
 
 		rq = cpu_rq(i);
@@ -3426,7 +3426,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance, cpumask_t *cpus)
+			int *balance, struct cpumask *cpus)
 {
 	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
 	struct sched_group *group;
@@ -3434,7 +3434,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	unsigned long flags;
 
-	cpus_setall(*cpus);
+	cpumask_setall(cpus);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3494,8 +3494,8 @@ redo:
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(all_pinned)) {
-			cpu_clear(cpu_of(busiest), *cpus);
-			if (!cpus_empty(*cpus))
+			cpumask_clear_cpu(cpu_of(busiest), cpus);
+			if (!cpumask_empty(cpus))
 				goto redo;
 			goto out_balanced;
 		}
@@ -3512,7 +3512,8 @@ redo:
 			/* don't kick the migration_thread, if the curr
 			 * task on busiest cpu can't be moved to this_cpu
 			 */
-			if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+			if (!cpumask_test_cpu(this_cpu,
+					      &busiest->curr->cpus_allowed)) {
 				spin_unlock_irqrestore(&busiest->lock, flags);
 				all_pinned = 1;
 				goto out_one_pinned;
@@ -3587,7 +3588,7 @@ out:
  */
 static int
 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-			cpumask_t *cpus)
+			struct cpumask *cpus)
 {
 	struct sched_group *group;
 	struct rq *busiest = NULL;
@@ -3596,7 +3597,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 	int sd_idle = 0;
 	int all_pinned = 0;
 
-	cpus_setall(*cpus);
+	cpumask_setall(cpus);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3640,8 +3641,8 @@ redo:
 		double_unlock_balance(this_rq, busiest);
 
 		if (unlikely(all_pinned)) {
-			cpu_clear(cpu_of(busiest), *cpus);
-			if (!cpus_empty(*cpus))
+			cpumask_clear_cpu(cpu_of(busiest), cpus);
+			if (!cpumask_empty(cpus))
 				goto redo;
 		}
 	}
@@ -5376,7 +5377,7 @@ out_unlock:
 	return retval;
 }
 
-long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
@@ -5445,13 +5446,13 @@ out_put_task:
 }
 
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-			     cpumask_t *new_mask)
+			     struct cpumask *new_mask)
 {
-	if (len < sizeof(cpumask_t)) {
-		memset(new_mask, 0, sizeof(cpumask_t));
-	} else if (len > sizeof(cpumask_t)) {
-		len = sizeof(cpumask_t);
-	}
+	if (len < cpumask_size())
+		cpumask_clear(new_mask);
+	else if (len > cpumask_size())
+		len = cpumask_size();
+
 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
 }
 
@@ -5477,7 +5478,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 	return retval;
 }
 
-long sched_getaffinity(pid_t pid, cpumask_t *mask)
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
 	struct task_struct *p;
 	int retval;
@@ -5494,7 +5495,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 	if (retval)
 		goto out_unlock;
 
-	cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
 
 out_unlock:
 	read_unlock(&tasklist_lock);
@@ -5872,7 +5873,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->se.exec_start = sched_clock();
 
 	idle->prio = idle->normal_prio = MAX_PRIO;
-	idle->cpus_allowed = cpumask_of_cpu(cpu);
+	cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
 	__set_task_cpu(idle, cpu);
 
 	rq->curr = rq->idle = idle;
@@ -5956,7 +5957,7 @@ static inline void sched_init_granularity(void)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct migration_req req;
 	unsigned long flags;
@@ -5964,13 +5965,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 	int ret = 0;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpus_intersects(*new_mask, cpu_online_map)) {
+	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-		     !cpus_equal(p->cpus_allowed, *new_mask))) {
+		     !cpumask_equal(&p->cpus_allowed, new_mask))) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -5978,12 +5979,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	else {
-		p->cpus_allowed = *new_mask;
-		p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+		cpumask_copy(&p->cpus_allowed, new_mask);
+		p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpu_isset(task_cpu(p), *new_mask))
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
 	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
@@ -6028,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	if (task_cpu(p) != src_cpu)
 		goto done;
 	/* Affinity changed (again). */
-	if (!cpu_isset(dest_cpu, p->cpus_allowed))
+	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 		goto fail;
 
 	on_rq = p->se.on_rq;
@@ -6629,13 +6630,13 @@ early_initcall(migration_init);
 #ifdef CONFIG_SCHED_DEBUG
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-				  cpumask_t *groupmask)
+				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
 	char str[256];
 
 	cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
-	cpus_clear(*groupmask);
+	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
 
@@ -6936,24 +6937,25 @@ __setup("isolcpus=", isolated_cpu_setup);
 /*
  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
  * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
- * (due to the fact that we keep track of groups covered with a cpumask_t).
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
  *
  * init_sched_build_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
 static void
-init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-			int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+init_sched_build_groups(const struct cpumask *span,
+			const struct cpumask *cpu_map,
+			int (*group_fn)(int cpu, const struct cpumask *cpu_map,
 					struct sched_group **sg,
-					cpumask_t *tmpmask),
-			cpumask_t *covered, cpumask_t *tmpmask)
+					struct cpumask *tmpmask),
+			struct cpumask *covered, struct cpumask *tmpmask)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	int i;
 
-	cpus_clear(*covered);
+	cpumask_clear(covered);
 
 	for_each_cpu(i, span) {
 		struct sched_group *sg;
@@ -6970,7 +6972,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
 				continue;
 
-			cpu_set(j, *covered);
+			cpumask_set_cpu(j, covered);
 			cpumask_set_cpu(j, sched_group_cpus(sg));
 		}
 		if (!first)
@@ -7035,9 +7037,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
-static void sched_domain_node_span(int node, cpumask_t *span)
+static void sched_domain_node_span(int node, struct cpumask *span)
 {
 	nodemask_t used_nodes;
+	/* FIXME: use cpumask_of_node() */
 	node_to_cpumask_ptr(nodemask, node);
 	int i;
 
@@ -7081,8 +7084,8 @@ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
 
 static int
-cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		 cpumask_t *unused)
+cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+		 struct sched_group **sg, struct cpumask *unused)
 {
 	if (sg)
 		*sg = &per_cpu(sched_group_cpus, cpu).sg;
@@ -7100,22 +7103,21 @@ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		  cpumask_t *mask)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
 
-	*mask = per_cpu(cpu_sibling_map, cpu);
-	cpus_and(*mask, *mask, *cpu_map);
-	group = first_cpu(*mask);
+	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	group = cpumask_first(mask);
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
 	return group;
 }
 #elif defined(CONFIG_SCHED_MC)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		  cpumask_t *unused)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *unused)
 {
 	if (sg)
 		*sg = &per_cpu(sched_group_core, cpu).sg;
@@ -7127,18 +7129,18 @@ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
 
 static int
-cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-		  cpumask_t *mask)
+cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+		  struct sched_group **sg, struct cpumask *mask)
 {
 	int group;
 #ifdef CONFIG_SCHED_MC
+	/* FIXME: Use cpu_coregroup_mask. */
 	*mask = cpu_coregroup_map(cpu);
 	cpus_and(*mask, *mask, *cpu_map);
-	group = first_cpu(*mask);
+	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	*mask = per_cpu(cpu_sibling_map, cpu);
-	cpus_and(*mask, *mask, *cpu_map);
-	group = first_cpu(*mask);
+	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	group = cpumask_first(mask);
 #else
 	group = cpu;
 #endif
@@ -7159,14 +7161,16 @@ static struct sched_group ***sched_group_nodes_bycpu;
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-				 struct sched_group **sg, cpumask_t *nodemask)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+				 struct sched_group **sg,
+				 struct cpumask *nodemask)
 {
 	int group;
+	/* FIXME: use cpumask_of_node */
 	node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-	cpus_and(*nodemask, *pnodemask, *cpu_map);
-	group = first_cpu(*nodemask);
+	cpumask_and(nodemask, pnodemask, cpu_map);
+	group = cpumask_first(nodemask);
 
 	if (sg)
 		*sg = &per_cpu(sched_group_allnodes, group).sg;
@@ -7202,7 +7206,8 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+			      struct cpumask *nodemask)
 {
 	int cpu, i;
 
@@ -7215,10 +7220,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 
 		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+			/* FIXME: Use cpumask_of_node */
 			node_to_cpumask_ptr(pnodemask, i);
 
 			cpus_and(*nodemask, *pnodemask, *cpu_map);
-			if (cpus_empty(*nodemask))
+			if (cpumask_empty(nodemask))
 				continue;
 
 			if (sg == NULL)
@@ -7236,7 +7242,8 @@ next_sg:
 	}
 }
 #else /* !CONFIG_NUMA */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+			      struct cpumask *nodemask)
 {
 }
 #endif /* CONFIG_NUMA */
@@ -7366,7 +7373,7 @@ static void set_domain_attribute(struct sched_domain *sd,
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int __build_sched_domains(const cpumask_t *cpu_map,
+static int __build_sched_domains(const struct cpumask *cpu_map,
 				 struct sched_domain_attr *attr)
 {
 	int i, err = -ENOMEM;
@@ -7416,7 +7423,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
 #endif
 
 	/*
@@ -7425,12 +7432,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 
+		/* FIXME: use cpumask_of_node */
 		*nodemask = node_to_cpumask(cpu_to_node(i));
 		cpus_and(*nodemask, *nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-		if (cpus_weight(*cpu_map) >
-				SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+		if (cpumask_weight(cpu_map) >
+				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
 			sd = &per_cpu(allnodes_domains, i);
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
@@ -7491,9 +7499,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
-		*this_sibling_map = per_cpu(cpu_sibling_map, i);
-		cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-		if (i != first_cpu(*this_sibling_map))
+		cpumask_and(this_sibling_map,
+			    &per_cpu(cpu_sibling_map, i), cpu_map);
+		if (i != cpumask_first(this_sibling_map))
 			continue;
 
 		init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7505,9 +7513,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
 	for_each_cpu(i, cpu_map) {
+		/* FIXME: Use cpu_coregroup_mask */
 		*this_core_map = cpu_coregroup_map(i);
 		cpus_and(*this_core_map, *this_core_map, *cpu_map);
-		if (i != first_cpu(*this_core_map))
+		if (i != cpumask_first(this_core_map))
 			continue;
 
 		init_sched_build_groups(this_core_map, cpu_map,
@@ -7518,9 +7527,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Set up physical groups */
 	for (i = 0; i < nr_node_ids; i++) {
+		/* FIXME: Use cpumask_of_node */
 		*nodemask = node_to_cpumask(i);
 		cpus_and(*nodemask, *nodemask, *cpu_map);
-		if (cpus_empty(*nodemask))
+		if (cpumask_empty(nodemask))
 			continue;
 
 		init_sched_build_groups(nodemask, cpu_map,
@@ -7541,17 +7551,18 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		struct sched_group *sg, *prev;
 		int j;
 
+		/* FIXME: Use cpumask_of_node */
 		*nodemask = node_to_cpumask(i);
-		cpus_clear(*covered);
+		cpumask_clear(covered);
 
 		cpus_and(*nodemask, *nodemask, *cpu_map);
-		if (cpus_empty(*nodemask)) {
+		if (cpumask_empty(nodemask)) {
 			sched_group_nodes[i] = NULL;
 			continue;
 		}
 
 		sched_domain_node_span(i, domainspan);
-		cpus_and(*domainspan, *domainspan, *cpu_map);
+		cpumask_and(domainspan, domainspan, cpu_map);
 
 		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				  GFP_KERNEL, i);
@@ -7570,21 +7581,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		sg->__cpu_power = 0;
 		cpumask_copy(sched_group_cpus(sg), nodemask);
 		sg->next = sg;
-		cpus_or(*covered, *covered, *nodemask);
+		cpumask_or(covered, covered, nodemask);
 		prev = sg;
 
 		for (j = 0; j < nr_node_ids; j++) {
 			int n = (i + j) % nr_node_ids;
+			/* FIXME: Use cpumask_of_node */
 			node_to_cpumask_ptr(pnodemask, n);
 
-			cpus_complement(*notcovered, *covered);
-			cpus_and(*tmpmask, *notcovered, *cpu_map);
-			cpus_and(*tmpmask, *tmpmask, *domainspan);
-			if (cpus_empty(*tmpmask))
+			cpumask_complement(notcovered, covered);
+			cpumask_and(tmpmask, notcovered, cpu_map);
+			cpumask_and(tmpmask, tmpmask, domainspan);
+			if (cpumask_empty(tmpmask))
 				break;
 
-			cpus_and(*tmpmask, *tmpmask, *pnodemask);
-			if (cpus_empty(*tmpmask))
+			cpumask_and(tmpmask, tmpmask, pnodemask);
+			if (cpumask_empty(tmpmask))
 				continue;
 
 			sg = kmalloc_node(sizeof(struct sched_group) +
@@ -7598,7 +7610,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			sg->__cpu_power = 0;
 			cpumask_copy(sched_group_cpus(sg), tmpmask);
 			sg->next = prev->next;
-			cpus_or(*covered, *covered, *tmpmask);
+			cpumask_or(covered, covered, tmpmask);
 			prev->next = sg;
 			prev = sg;
 		}
@@ -7634,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	if (sd_allnodes) {
 		struct sched_group *sg;
 
-		cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
 								tmpmask);
 		init_numa_sched_groups_power(sg);
 	}
@@ -7690,12 +7702,12 @@ error:
 #endif
 }
 
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const struct cpumask *cpu_map)
 {
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static cpumask_t *doms_cur;	/* current sched domains */
+static struct cpumask *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -7716,13 +7728,13 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
-static int arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const struct cpumask *cpu_map)
 {
 	int err;
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
 	if (!doms_cur)
 		doms_cur = fallback_doms;
 	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
@@ -7733,8 +7745,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
 	return err;
 }
 
-static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-				       cpumask_t *tmpmask)
+static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+				       struct cpumask *tmpmask)
 {
 	free_sched_groups(cpu_map, tmpmask);
 }
@@ -7743,15 +7755,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
-	cpumask_t tmpmask;
+	/* Save because hotplug lock held. */
+	static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
 	int i;
 
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
-	arch_destroy_sched_domains(cpu_map, &tmpmask);
+	arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
 }
 
 /* handle null as "default" */
@@ -7776,7 +7789,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
@@ -7790,13 +7803,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * the single partition 'fallback_doms', it also forces the domains
  * to be rebuilt.
  *
- * If doms_new == NULL it will be replaced with cpu_online_map.
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
  * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
-void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+/* FIXME: Change to struct cpumask *doms_new[] */
+void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -7811,7 +7825,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n; j++) {
-			if (cpus_equal(doms_cur[i], doms_new[j])
+			if (cpumask_equal(&doms_cur[i], &doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
@@ -7831,7 +7845,7 @@ match1:
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
-			if (cpus_equal(doms_new[i], doms_cur[j])
+			if (cpumask_equal(&doms_new[i], &doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bba00402ed9..08ffffd4a41 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1017,7 +1017,7 @@ static void yield_task_fair(struct rq *rq)
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_map)
+ * hence we need to mask them out (cpu_active_mask)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1244,7 +1244,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 		}
 	}
 
-	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
 		goto out;
 
 	/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1f0e99d1a8c..fb3964579a8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -923,7 +923,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+	    (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
 	    (p->rt.nr_cpus_allowed > 1))
 		return 1;
 	return 0;
@@ -982,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
-	cpumask_t *lowest_mask = __get_cpu_var(local_cpu_mask);
+	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
 
@@ -997,7 +997,7 @@ static int find_lowest_rq(struct task_struct *task)
 	 * I guess we might want to change cpupri_find() to ignore those
 	 * in the first place.
 	 */
-	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+	cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
 
 	/*
 	 * At this point we have built a mask of cpus representing the
@@ -1007,7 +1007,7 @@ static int find_lowest_rq(struct task_struct *task)
 	 * We prioritize the last cpu that the task executed on since
 	 * it is most likely cache-hot in that location.
 	 */
-	if (cpu_isset(cpu, *lowest_mask))
+	if (cpumask_test_cpu(cpu, lowest_mask))
 		return cpu;
 
 	/*
@@ -1064,8 +1064,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			 * Also make sure that it wasn't scheduled on its rq.
 			 */
 			if (unlikely(task_rq(task) != rq ||
-				     !cpu_isset(lowest_rq->cpu,
-						task->cpus_allowed) ||
+				     !cpumask_test_cpu(lowest_rq->cpu,
+						       &task->cpus_allowed) ||
 				     task_running(rq, task) ||
 				     !task->se.on_rq)) {
 
@@ -1315,9 +1315,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 
 static void set_cpus_allowed_rt(struct task_struct *p,
-				const cpumask_t *new_mask)
+				const struct cpumask *new_mask)
 {
-	int weight = cpus_weight(*new_mask);
+	int weight = cpumask_weight(new_mask);
 
 	BUG_ON(!rt_task(p));
 
@@ -1338,7 +1338,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		update_rt_migration(rq);
 	}
 
-	p->cpus_allowed    = *new_mask;
+	cpumask_copy(&p->cpus_allowed, new_mask);
 	p->rt.nr_cpus_allowed = weight;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 1acdac104668a0834cfa267de9946fac7764d486 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 20 Nov 2008 10:02:53 -0800
Subject: futex: make clock selectable for FUTEX_WAIT_BITSET

FUTEX_WAIT_BITSET could be used instead of FUTEX_WAIT by setting the
bit set to FUTEX_BITSET_MATCH_ANY, but FUTEX_WAIT uses CLOCK_REALTIME
while FUTEX_WAIT_BITSET uses CLOCK_MONOTONIC.

Add a flag to select CLOCK_REALTIME for FUTEX_WAIT_BITSET so glibc can
replace the FUTEX_WAIT logic which needs to do gettimeofday() calls
before and after the syscall to convert the absolute timeout to a
relative timeout for FUTEX_WAIT.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Ulrich Drepper <drepper@redhat.com>
---
 include/linux/futex.h |  3 ++-
 kernel/futex.c        | 24 +++++++++++++++++-------
 2 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 8f627b9ae2b..3bf5bb5a34f 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -25,7 +25,8 @@ union ktime;
 #define FUTEX_WAKE_BITSET	10
 
 #define FUTEX_PRIVATE_FLAG	128
-#define FUTEX_CMD_MASK		~FUTEX_PRIVATE_FLAG
+#define FUTEX_CLOCK_REALTIME	256
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
 
 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
diff --git a/kernel/futex.c b/kernel/futex.c
index e10c5c8786a..ba0d3b83c09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1142,12 +1142,13 @@ handle_fault:
  * In case we must use restart_block to restart a futex_wait,
  * we encode in the 'flags' shared capability
  */
-#define FLAGS_SHARED  1
+#define FLAGS_SHARED		0x01
+#define FLAGS_CLOCKRT		0x02
 
 static long futex_wait_restart(struct restart_block *restart);
 
 static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset)
+		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
@@ -1233,8 +1234,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 			slack = current->timer_slack_ns;
 			if (rt_task(current))
 				slack = 0;
-			hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
-						HRTIMER_MODE_ABS);
+			hrtimer_init_on_stack(&t.timer,
+					      clockrt ? CLOCK_REALTIME :
+					      CLOCK_MONOTONIC,
+					      HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
 			hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
 
@@ -1289,6 +1292,8 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 		if (fshared)
 			restart->futex.flags |= FLAGS_SHARED;
+		if (clockrt)
+			restart->futex.flags |= FLAGS_CLOCKRT;
 		return -ERESTART_RESTARTBLOCK;
 	}
 
@@ -1312,7 +1317,8 @@ static long futex_wait_restart(struct restart_block *restart)
 	if (restart->futex.flags & FLAGS_SHARED)
 		fshared = 1;
 	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
-				restart->futex.bitset);
+				restart->futex.bitset,
+				restart->futex.flags & FLAGS_CLOCKRT);
 }
 
 
@@ -1905,18 +1911,22 @@ void exit_robust_list(struct task_struct *curr)
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
-	int ret = -ENOSYS;
+	int clockrt, ret = -ENOSYS;
 	int cmd = op & FUTEX_CMD_MASK;
 	int fshared = 0;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
 		fshared = 1;
 
+	clockrt = op & FUTEX_CLOCK_REALTIME;
+	if (clockrt && cmd != FUTEX_WAIT_BITSET)
+		return -ENOSYS;
+
 	switch (cmd) {
 	case FUTEX_WAIT:
 		val3 = FUTEX_BITSET_MATCH_ANY;
 	case FUTEX_WAIT_BITSET:
-		ret = futex_wait(uaddr, fshared, val, timeout, val3);
+		ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
 		break;
 	case FUTEX_WAKE:
 		val3 = FUTEX_BITSET_MATCH_ANY;
-- 
cgit v1.2.3-70-g09d2


From 18b6e0414e42d95183f07d8177e3ff0241abd825 Mon Sep 17 00:00:00 2001
From: Serge Hallyn <serue@us.ibm.com>
Date: Wed, 15 Oct 2008 16:38:45 -0500
Subject: User namespaces: set of cleanups (v2)

The user_ns is moved from nsproxy to user_struct, so that a struct
cred by itself is sufficient to determine access (which it otherwise
would not be).  Corresponding ecryptfs fixes (by David Howells) are
here as well.

Fix refcounting.  The following rules now apply:
        1. The task pins the user struct.
        2. The user struct pins its user namespace.
        3. The user namespace pins the struct user which created it.

User namespaces are cloned during copy_creds().  Unsharing a new user_ns
is no longer possible.  (We could re-add that, but it'll cause code
duplication and doesn't seem useful if PAM doesn't need to clone user
namespaces).

When a user namespace is created, its first user (uid 0) gets empty
keyrings and a clean group_info.

This incorporates a previous patch by David Howells.  Here
is his original patch description:

>I suggest adding the attached incremental patch.  It makes the following
>changes:
>
> (1) Provides a current_user_ns() macro to wrap accesses to current's user
>     namespace.
>
> (2) Fixes eCryptFS.
>
> (3) Renames create_new_userns() to create_user_ns() to be more consistent
>     with the other associated functions and because the 'new' in the name is
>     superfluous.
>
> (4) Moves the argument and permission checks made for CLONE_NEWUSER to the
>     beginning of do_fork() so that they're done prior to making any attempts
>     at allocation.
>
> (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds
>     to fill in rather than have it return the new root user.  I don't imagine
>     the new root user being used for anything other than filling in a cred
>     struct.
>
>     This also permits me to get rid of a get_uid() and a free_uid(), as the
>     reference the creds were holding on the old user_struct can just be
>     transferred to the new namespace's creator pointer.
>
> (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under
>     preparation rather than doing it in copy_creds().
>
>David

>Signed-off-by: David Howells <dhowells@redhat.com>

Changelog:
	Oct 20: integrate dhowells comments
		1. leave thread_keyring alone
		2. use current_user_ns() in set_user()

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
 fs/ecryptfs/messaging.c        | 13 ++++----
 fs/ecryptfs/miscdev.c          | 19 ++++-------
 include/linux/cred.h           |  2 ++
 include/linux/init_task.h      |  1 -
 include/linux/nsproxy.h        |  1 -
 include/linux/sched.h          |  1 +
 include/linux/user_namespace.h | 13 +++-----
 kernel/cred.c                  | 15 +++++++--
 kernel/fork.c                  | 19 +++++++++--
 kernel/nsproxy.c               | 15 ++-------
 kernel/sys.c                   |  4 +--
 kernel/user.c                  | 47 ++++++++------------------
 kernel/user_namespace.c        | 75 +++++++++++++++++-------------------------
 13 files changed, 96 insertions(+), 129 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index e0b0a4e28b9..6913f727624 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -360,7 +360,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 	struct ecryptfs_msg_ctx *msg_ctx;
 	size_t msg_size;
 	struct nsproxy *nsproxy;
-	struct user_namespace *current_user_ns;
+	struct user_namespace *tsk_user_ns;
 	uid_t ctx_euid;
 	int rc;
 
@@ -385,9 +385,9 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 		mutex_unlock(&ecryptfs_daemon_hash_mux);
 		goto wake_up;
 	}
-	current_user_ns = nsproxy->user_ns;
+	tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns;
 	ctx_euid = task_euid(msg_ctx->task);
-	rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, current_user_ns);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns);
 	rcu_read_unlock();
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
 	if (rc) {
@@ -405,11 +405,11 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 		       euid, ctx_euid);
 		goto unlock;
 	}
-	if (current_user_ns != user_ns) {
+	if (tsk_user_ns != user_ns) {
 		rc = -EBADMSG;
 		printk(KERN_WARNING "%s: Received message from user_ns "
 		       "[0x%p]; expected message from user_ns [0x%p]\n",
-		       __func__, user_ns, nsproxy->user_ns);
+		       __func__, user_ns, tsk_user_ns);
 		goto unlock;
 	}
 	if (daemon->pid != pid) {
@@ -468,8 +468,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
 	uid_t euid = current_euid();
 	int rc;
 
-	rc = ecryptfs_find_daemon_by_euid(&daemon, euid,
-					  current->nsproxy->user_ns);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
 	if (rc || !daemon) {
 		rc = -ENOTCONN;
 		printk(KERN_ERR "%s: User [%d] does not have a daemon "
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 047ac609695..efd95a0ed1e 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -47,8 +47,7 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
 	/* TODO: Just use file->private_data? */
-	rc = ecryptfs_find_daemon_by_euid(&daemon, euid,
-					  current->nsproxy->user_ns);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
 	BUG_ON(rc || !daemon);
 	mutex_lock(&daemon->mux);
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
@@ -95,11 +94,9 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 		       "count; rc = [%d]\n", __func__, rc);
 		goto out_unlock_daemon_list;
 	}
-	rc = ecryptfs_find_daemon_by_euid(&daemon, euid,
-					  current->nsproxy->user_ns);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
 	if (rc || !daemon) {
-		rc = ecryptfs_spawn_daemon(&daemon, euid,
-					   current->nsproxy->user_ns,
+		rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(),
 					   task_pid(current));
 		if (rc) {
 			printk(KERN_ERR "%s: Error attempting to spawn daemon; "
@@ -153,8 +150,7 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file)
 	int rc;
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_find_daemon_by_euid(&daemon, euid,
-					  current->nsproxy->user_ns);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
 	BUG_ON(rc || !daemon);
 	mutex_lock(&daemon->mux);
 	BUG_ON(daemon->pid != task_pid(current));
@@ -254,8 +250,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
 	/* TODO: Just use file->private_data? */
-	rc = ecryptfs_find_daemon_by_euid(&daemon, euid,
-					  current->nsproxy->user_ns);
+	rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
 	BUG_ON(rc || !daemon);
 	mutex_lock(&daemon->mux);
 	if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
@@ -295,7 +290,7 @@ check_list:
 		goto check_list;
 	}
 	BUG_ON(euid != daemon->euid);
-	BUG_ON(current->nsproxy->user_ns != daemon->user_ns);
+	BUG_ON(current_user_ns() != daemon->user_ns);
 	BUG_ON(task_pid(current) != daemon->pid);
 	msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue,
 				   struct ecryptfs_msg_ctx, daemon_out_list);
@@ -468,7 +463,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 			goto out_free;
 		}
 		rc = ecryptfs_miscdev_response(&data[i], packet_size,
-					       euid, current->nsproxy->user_ns,
+					       euid, current_user_ns(),
 					       task_pid(current), seq);
 		if (rc)
 			printk(KERN_WARNING "%s: Failed to deliver miscdev "
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 26c1ab17994..3282ee4318e 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -60,6 +60,7 @@ do {							\
 } while (0)
 
 extern struct group_info *groups_alloc(int);
+extern struct group_info init_groups;
 extern void groups_free(struct group_info *);
 extern int set_current_groups(struct group_info *);
 extern int set_groups(struct cred *, struct group_info *);
@@ -315,6 +316,7 @@ static inline void put_cred(const struct cred *_cred)
 #define current_fsgid() 	(current_cred_xxx(fsgid))
 #define current_cap()		(current_cred_xxx(cap_effective))
 #define current_user()		(current_cred_xxx(user))
+#define current_user_ns()	(current_cred_xxx(user)->user_ns)
 #define current_security()	(current_cred_xxx(security))
 
 #define current_uid_gid(_uid, _gid)		\
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2597858035c..959f5522d10 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -57,7 +57,6 @@ extern struct nsproxy init_nsproxy;
 	.mnt_ns		= NULL,						\
 	INIT_NET_NS(net_ns)                                             \
 	INIT_IPC_NS(ipc_ns)						\
-	.user_ns	= &init_user_ns,				\
 }
 
 #define INIT_SIGHAND(sighand) {						\
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index c8a768e5964..afad7dec1b3 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -27,7 +27,6 @@ struct nsproxy {
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
 	struct pid_namespace *pid_ns;
-	struct user_namespace *user_ns;
 	struct net 	     *net_ns;
 };
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2036e9f2602..7f8015a3082 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -638,6 +638,7 @@ struct user_struct {
 	/* Hash table maintenance information */
 	struct hlist_node uidhash_node;
 	uid_t uid;
+	struct user_namespace *user_ns;
 
 #ifdef CONFIG_USER_SCHED
 	struct task_group *tg;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index b5f41d4c2ee..315bcd37522 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -12,7 +12,7 @@
 struct user_namespace {
 	struct kref		kref;
 	struct hlist_head	uidhash_table[UIDHASH_SZ];
-	struct user_struct	*root_user;
+	struct user_struct	*creator;
 };
 
 extern struct user_namespace init_user_ns;
@@ -26,8 +26,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 	return ns;
 }
 
-extern struct user_namespace *copy_user_ns(int flags,
-					   struct user_namespace *old_ns);
+extern int create_user_ns(struct cred *new);
 extern void free_user_ns(struct kref *kref);
 
 static inline void put_user_ns(struct user_namespace *ns)
@@ -43,13 +42,9 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 	return &init_user_ns;
 }
 
-static inline struct user_namespace *copy_user_ns(int flags,
-						  struct user_namespace *old_ns)
+static inline int create_user_ns(struct cred *new)
 {
-	if (flags & CLONE_NEWUSER)
-		return ERR_PTR(-EINVAL);
-
-	return old_ns;
+	return -EINVAL;
 }
 
 static inline void put_user_ns(struct user_namespace *ns)
diff --git a/kernel/cred.c b/kernel/cred.c
index 13697ca2bb3..ff7bc071991 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -274,6 +274,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct thread_group_cred *tgcred;
 #endif
 	struct cred *new;
+	int ret;
 
 	mutex_init(&p->cred_exec_mutex);
 
@@ -293,6 +294,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	if (!new)
 		return -ENOMEM;
 
+	if (clone_flags & CLONE_NEWUSER) {
+		ret = create_user_ns(new);
+		if (ret < 0)
+			goto error_put;
+	}
+
 #ifdef CONFIG_KEYS
 	/* new threads get their own thread keyrings if their parent already
 	 * had one */
@@ -309,8 +316,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	if (!(clone_flags & CLONE_THREAD)) {
 		tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
 		if (!tgcred) {
-			put_cred(new);
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto error_put;
 		}
 		atomic_set(&tgcred->usage, 1);
 		spin_lock_init(&tgcred->lock);
@@ -325,6 +332,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	atomic_inc(&new->user->processes);
 	p->cred = p->real_cred = get_cred(new);
 	return 0;
+
+error_put:
+	put_cred(new);
+	return ret;
 }
 
 /**
diff --git a/kernel/fork.c b/kernel/fork.c
index 29c18c14812..1dd89451fae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -976,7 +976,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (atomic_read(&p->real_cred->user->processes) >=
 			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
 		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
-		    p->real_cred->user != current->nsproxy->user_ns->root_user)
+		    p->real_cred->user != INIT_USER)
 			goto bad_fork_free;
 	}
 
@@ -1334,6 +1334,20 @@ long do_fork(unsigned long clone_flags,
 	int trace = 0;
 	long nr;
 
+	/*
+	 * Do some preliminary argument and permissions checking before we
+	 * actually start allocating stuff
+	 */
+	if (clone_flags & CLONE_NEWUSER) {
+		if (clone_flags & CLONE_THREAD)
+			return -EINVAL;
+		/* hopefully this check will go away when userns support is
+		 * complete
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
 	/*
 	 * We hope to recycle these flags after 2.6.26
 	 */
@@ -1581,8 +1595,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 	err = -EINVAL;
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER|
-				CLONE_NEWNET))
+				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
 		goto bad_unshare_out;
 
 	/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29a258..63598dca2d0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_pid;
 	}
 
-	new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
-	if (IS_ERR(new_nsp->user_ns)) {
-		err = PTR_ERR(new_nsp->user_ns);
-		goto out_user;
-	}
-
 	new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
 	if (IS_ERR(new_nsp->net_ns)) {
 		err = PTR_ERR(new_nsp->net_ns);
@@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	return new_nsp;
 
 out_net:
-	if (new_nsp->user_ns)
-		put_user_ns(new_nsp->user_ns);
-out_user:
 	if (new_nsp->pid_ns)
 		put_pid_ns(new_nsp->pid_ns);
 out_pid:
@@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	get_nsproxy(old_ns);
 
 	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-				CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
+				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN)) {
@@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns)
 		put_pid_ns(ns->pid_ns);
-	if (ns->user_ns)
-		put_user_ns(ns->user_ns);
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
@@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWUSER | CLONE_NEWNET)))
+			       CLONE_NEWNET)))
 		return 0;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/sys.c b/kernel/sys.c
index ab735040468..ebe65c2c987 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -565,13 +565,13 @@ static int set_user(struct cred *new)
 {
 	struct user_struct *new_user;
 
-	new_user = alloc_uid(current->nsproxy->user_ns, new->uid);
+	new_user = alloc_uid(current_user_ns(), new->uid);
 	if (!new_user)
 		return -EAGAIN;
 
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
-			new_user != current->nsproxy->user_ns->root_user) {
+			new_user != INIT_USER) {
 		free_uid(new_user);
 		return -EAGAIN;
 	}
diff --git a/kernel/user.c b/kernel/user.c
index d476307dd4b..c0ef3a46443 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,9 +20,9 @@
 
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(2),
+		.refcount	= ATOMIC_INIT(1),
 	},
-	.root_user = &root_user,
+	.creator = &root_user,
 };
 EXPORT_SYMBOL_GPL(init_user_ns);
 
@@ -48,12 +48,14 @@ static struct kmem_cache *uid_cachep;
  */
 static DEFINE_SPINLOCK(uidhash_lock);
 
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
 struct user_struct root_user = {
-	.__count	= ATOMIC_INIT(1),
+	.__count	= ATOMIC_INIT(2),
 	.processes	= ATOMIC_INIT(1),
 	.files		= ATOMIC_INIT(0),
 	.sigpending	= ATOMIC_INIT(0),
 	.locked_shm     = 0,
+	.user_ns	= &init_user_ns,
 #ifdef CONFIG_USER_SCHED
 	.tg		= &init_task_group,
 #endif
@@ -314,12 +316,13 @@ done:
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
  */
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
 {
 	/* restore back the count */
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
+	put_user_ns(up->user_ns);
 	INIT_WORK(&up->work, remove_user_sysfs_dir);
 	schedule_work(&up->work);
 }
@@ -335,13 +338,14 @@ static inline void uids_mutex_unlock(void) { }
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
  */
-static inline void free_user(struct user_struct *up, unsigned long flags)
+static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
+	put_user_ns(up->user_ns);
 	kmem_cache_free(uid_cachep, up);
 }
 
@@ -357,7 +361,7 @@ struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
-	struct user_namespace *ns = current->nsproxy->user_ns;
+	struct user_namespace *ns = current_user()->user_ns;
 
 	spin_lock_irqsave(&uidhash_lock, flags);
 	ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -404,6 +408,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		if (sched_create_user(new) < 0)
 			goto out_free_user;
 
+		new->user_ns = get_user_ns(ns);
+
 		if (uids_user_create(new))
 			goto out_destoy_sched;
 
@@ -427,7 +433,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 			up = new;
 		}
 		spin_unlock_irq(&uidhash_lock);
-
 	}
 
 	uids_mutex_unlock();
@@ -436,6 +441,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 
 out_destoy_sched:
 	sched_destroy_user(new);
+	put_user_ns(new->user_ns);
 out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
@@ -443,33 +449,6 @@ out_unlock:
 	return NULL;
 }
 
-#ifdef CONFIG_USER_NS
-void release_uids(struct user_namespace *ns)
-{
-	int i;
-	unsigned long flags;
-	struct hlist_head *head;
-	struct hlist_node *nd;
-
-	spin_lock_irqsave(&uidhash_lock, flags);
-	/*
-	 * collapse the chains so that the user_struct-s will
-	 * be still alive, but not in hashes. subsequent free_uid()
-	 * will free them.
-	 */
-	for (i = 0; i < UIDHASH_SZ; i++) {
-		head = ns->uidhash_table + i;
-		while (!hlist_empty(head)) {
-			nd = head->first;
-			hlist_del_init(nd);
-		}
-	}
-	spin_unlock_irqrestore(&uidhash_lock, flags);
-
-	free_uid(ns->root_user);
-}
-#endif
-
 static int __init uid_cache_init(void)
 {
 	int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0d9c51d6733..79084311ee5 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,70 +9,55 @@
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/cred.h>
 
 /*
- * Clone a new ns copying an original user ns, setting refcount to 1
- * @old_ns: namespace to clone
- * Return NULL on error (failure to kmalloc), new ns otherwise
+ * Create a new user namespace, deriving the creator from the user in the
+ * passed credentials, and replacing that user with the new root user for the
+ * new namespace.
+ *
+ * This is called by copy_creds(), which will finish setting the target task's
+ * credentials.
  */
-static struct user_namespace *clone_user_ns(struct user_namespace *old_ns)
+int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns;
-	struct user_struct *new_user;
-	struct cred *new;
+	struct user_struct *root_user;
 	int n;
 
 	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
 	if (!ns)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	kref_init(&ns->kref);
 
 	for (n = 0; n < UIDHASH_SZ; ++n)
 		INIT_HLIST_HEAD(ns->uidhash_table + n);
 
-	/* Insert new root user.  */
-	ns->root_user = alloc_uid(ns, 0);
-	if (!ns->root_user) {
+	/* Alloc new root user.  */
+	root_user = alloc_uid(ns, 0);
+	if (!root_user) {
 		kfree(ns);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	/* Reset current->user with a new one */
-	new_user = alloc_uid(ns, current_uid());
-	if (!new_user) {
-		free_uid(ns->root_user);
-		kfree(ns);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* Install the new user */
-	new = prepare_creds();
-	if (!new) {
-		free_uid(new_user);
-		free_uid(ns->root_user);
-		kfree(ns);
-	}
-	free_uid(new->user);
-	new->user = new_user;
-	commit_creds(new);
-	return ns;
-}
-
-struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
-{
-	struct user_namespace *new_ns;
-
-	BUG_ON(!old_ns);
-	get_user_ns(old_ns);
-
-	if (!(flags & CLONE_NEWUSER))
-		return old_ns;
+	/* set the new root user in the credentials under preparation */
+	ns->creator = new->user;
+	new->user = root_user;
+	new->uid = new->euid = new->suid = new->fsuid = 0;
+	new->gid = new->egid = new->sgid = new->fsgid = 0;
+	put_group_info(new->group_info);
+	new->group_info = get_group_info(&init_groups);
+#ifdef CONFIG_KEYS
+	key_put(new->request_key_auth);
+	new->request_key_auth = NULL;
+#endif
+	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
 
-	new_ns = clone_user_ns(old_ns);
+	/* alloc_uid() incremented the userns refcount.  Just set it to 1 */
+	kref_set(&ns->kref, 1);
 
-	put_user_ns(old_ns);
-	return new_ns;
+	return 0;
 }
 
 void free_user_ns(struct kref *kref)
@@ -80,7 +65,7 @@ void free_user_ns(struct kref *kref)
 	struct user_namespace *ns;
 
 	ns = container_of(kref, struct user_namespace, kref);
-	release_uids(ns);
+	free_uid(ns->creator);
 	kfree(ns);
 }
 EXPORT_SYMBOL(free_user_ns);
-- 
cgit v1.2.3-70-g09d2


From 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Mon, 24 Nov 2008 21:20:15 -0800
Subject: tcp: Try to restore large SKBs while SACK processing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During SACK processing, most of the benefits of TSO are eaten by
the SACK blocks that one-by-one fragment SKBs to MSS sized chunks.
Then we're in problems when cleanup work for them has to be done
when a large cumulative ACK comes. Try to return back to pre-split
state already while more and more SACK info gets discovered by
combining newly discovered SACK areas with the previous skb if
that's SACKed as well.

This approach has a number of benefits:

1) The processing overhead is spread more equally over the RTT
2) Write queue has less skbs to process (affect everything
   which has to walk in the queue past the sacked areas)
3) Write queue is consistent whole the time, so no other parts
   of TCP has to be aware of this (this was not the case with
   some other approach that was, well, quite intrusive all
   around).
4) Clean_rtx_queue can release most of the pages using single
   put_page instead of previous PAGE_SIZE/mss+1 calls

In case a hole is fully filled by the new SACK block, we attempt
to combine the next skb too which allows construction of skbs
that are even larger than what tso split them to and it handles
hole per on every nth patterns that often occur during slow start
overshoot pretty nicely. Though this to be really useful also
a retransmission would have to get lost since cumulative ACKs
advance one hole at a time in the most typical case.

TODO: handle upwards only merging. That should be rather easy
when segment is fully sacked but I'm leaving that as future
work item (it won't make very large difference anyway since
this current approach already covers quite a lot of normal
cases).

I was earlier thinking of some sophisticated way of tracking
timestamps of the first and the last segment but later on
realized that it won't be that necessary at all to store the
timestamp of the last segment. The cases that can occur are
basically either:
  1) ambiguous => no sensible measurement can be taken anyway
  2) non-ambiguous is due to reordering => having the timestamp
     of the last segment there is just skewing things more off
     than does some good since the ack got triggered by one of
     the holes (besides some substle issues that would make
     determining right hole/skb even harder problem). Anyway,
     it has nothing to do with this change then.

I choose to route some abnormal looking cases with goto noop,
some could be handled differently (eg., by stopping the
walking at that skb but again). In general, they either
shouldn't happen at all or are rare enough to make no difference
in practice.

In theory this change (as whole) could cause some macroscale
regression (global) because of cache misses that are taken over
the round-trip time but it gets very likely better because of much
less (local) cache misses per other write queue walkers and the
big recovery clearing cumulative ack.

Worth to note that these benefits would be very easy to get also
without TSO/GSO being on as long as the data is in pages so that
we can merge them. Currently I won't let that happen because
DSACK splitting at fragment that would mess up pcounts due to
sk_can_gso in tcp_set_skb_tso_segs. Once DSACKs fragments gets
avoided, we have some conditions that can be made less strict.

TODO: I will probably have to convert the excessive pointer
passing to struct sacktag_state... :-)

My testing revealed that considerable amount of skbs couldn't
be shifted because they were cloned (most likely still awaiting
tx reclaim)...

[The rest is considering future work instead since I got
repeatably EFAULT to tcpdump's recvfrom when I added
pskb_expand_head to deal with clones, so I separated that
into another, later patch]

...To counter that, I gave up on the fifth advantage:

5) When growing previous SACK block, less allocs for new skbs
   are done, basically a new alloc is needed only when new hole
   is detected and when the previous skb runs out of frags space

...which now only happens of if reclaim is fast enough to dispose
the clone before the SACK block comes in (the window is RTT long),
otherwise we'll have to alloc some.

With clones being handled I got these numbers (will be somewhat
worse without that), taken with fine-grained mibs:

                  TCPSackShifted 398
                   TCPSackMerged 877
            TCPSackShiftFallback 320
      TCPSACKCOLLAPSEFALLBACKGSO 0
  TCPSACKCOLLAPSEFALLBACKSKBBITS 0
  TCPSACKCOLLAPSEFALLBACKSKBDATA 0
    TCPSACKCOLLAPSEFALLBACKBELOW 0
    TCPSACKCOLLAPSEFALLBACKFIRST 1
 TCPSACKCOLLAPSEFALLBACKPREVBITS 318
      TCPSACKCOLLAPSEFALLBACKMSS 1
   TCPSACKCOLLAPSEFALLBACKNOHEAD 0
    TCPSACKCOLLAPSEFALLBACKSHIFT 0
          TCPSACKCOLLAPSENOOPSEQ 0
  TCPSACKCOLLAPSENOOPSMALLPCOUNT 0
     TCPSACKCOLLAPSENOOPSMALLLEN 0
             TCPSACKCOLLAPSEHOLE 12

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  33 +++++++
 include/net/tcp.h      |   5 +
 net/core/skbuff.c      | 140 +++++++++++++++++++++++++++
 net/ipv4/tcp_input.c   | 256 +++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 427 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a01b6f84e3b..acf17af45af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -492,6 +492,19 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list,
 	return (skb->next == (struct sk_buff *) list);
 }
 
+/**
+ *	skb_queue_is_first - check if skb is the first entry in the queue
+ *	@list: queue head
+ *	@skb: buffer
+ *
+ *	Returns true if @skb is the first buffer on the list.
+ */
+static inline bool skb_queue_is_first(const struct sk_buff_head *list,
+				      const struct sk_buff *skb)
+{
+	return (skb->prev == (struct sk_buff *) list);
+}
+
 /**
  *	skb_queue_next - return the next packet in the queue
  *	@list: queue head
@@ -510,6 +523,24 @@ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
 	return skb->next;
 }
 
+/**
+ *	skb_queue_prev - return the prev packet in the queue
+ *	@list: queue head
+ *	@skb: current buffer
+ *
+ *	Return the prev packet in @list before @skb.  It is only valid to
+ *	call this if skb_queue_is_first() evaluates to false.
+ */
+static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
+					     const struct sk_buff *skb)
+{
+	/* This BUG_ON may seem severe, but if we just return then we
+	 * are going to dereference garbage.
+	 */
+	BUG_ON(skb_queue_is_first(list, skb));
+	return skb->prev;
+}
+
 /**
  *	skb_get - reference buffer
  *	@skb: buffer to reference
@@ -1652,6 +1683,8 @@ extern int             skb_splice_bits(struct sk_buff *skb,
 extern void	       skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 extern void	       skb_split(struct sk_buff *skb,
 				 struct sk_buff *skb1, const u32 len);
+extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
+				 int shiftlen);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 90b4c3b4c33..265392470b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1192,6 +1192,11 @@ static inline struct sk_buff *tcp_write_queue_next(struct sock *sk, struct sk_bu
 	return skb_queue_next(&sk->sk_write_queue, skb);
 }
 
+static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_buff *skb)
+{
+	return skb_queue_prev(&sk->sk_write_queue, skb);
+}
+
 #define tcp_for_write_queue(skb, sk)					\
 	skb_queue_walk(&(sk)->sk_write_queue, skb)
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 267185a848f..844b8abeb18 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2018,6 +2018,146 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 		skb_split_no_header(skb, skb1, len, pos);
 }
 
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * TODO: handle cloned skbs by using pskb_expand_head()
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+	return skb_cloned(skb);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes from
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from tgt to skb. Returns number bytes shifted.
+ * It's up to caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+	int from, to, merge, todo;
+	struct skb_frag_struct *fragfrom, *fragto;
+
+	BUG_ON(shiftlen > skb->len);
+	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */
+
+	todo = shiftlen;
+	from = 0;
+	to = skb_shinfo(tgt)->nr_frags;
+	fragfrom = &skb_shinfo(skb)->frags[from];
+
+	/* Actual merge is delayed until the point when we know we can
+	 * commit all, so that we don't have to undo partial changes
+	 */
+	if (!to ||
+	    !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+		merge = -1;
+	} else {
+		merge = to - 1;
+
+		todo -= fragfrom->size;
+		if (todo < 0) {
+			if (skb_prepare_for_shift(skb) ||
+			    skb_prepare_for_shift(tgt))
+				return 0;
+
+			fragto = &skb_shinfo(tgt)->frags[merge];
+
+			fragto->size += shiftlen;
+			fragfrom->size -= shiftlen;
+			fragfrom->page_offset += shiftlen;
+
+			goto onlymerged;
+		}
+
+		from++;
+	}
+
+	/* Skip full, not-fitting skb to avoid expensive operations */
+	if ((shiftlen == skb->len) &&
+	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+		return 0;
+
+	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+		return 0;
+
+	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+		if (to == MAX_SKB_FRAGS)
+			return 0;
+
+		fragfrom = &skb_shinfo(skb)->frags[from];
+		fragto = &skb_shinfo(tgt)->frags[to];
+
+		if (todo >= fragfrom->size) {
+			*fragto = *fragfrom;
+			todo -= fragfrom->size;
+			from++;
+			to++;
+
+		} else {
+			get_page(fragfrom->page);
+			fragto->page = fragfrom->page;
+			fragto->page_offset = fragfrom->page_offset;
+			fragto->size = todo;
+
+			fragfrom->page_offset += todo;
+			fragfrom->size -= todo;
+			todo = 0;
+
+			to++;
+			break;
+		}
+	}
+
+	/* Ready to "commit" this state change to tgt */
+	skb_shinfo(tgt)->nr_frags = to;
+
+	if (merge >= 0) {
+		fragfrom = &skb_shinfo(skb)->frags[0];
+		fragto = &skb_shinfo(tgt)->frags[merge];
+
+		fragto->size += fragfrom->size;
+		put_page(fragfrom->page);
+	}
+
+	/* Reposition in the original skb */
+	to = 0;
+	while (from < skb_shinfo(skb)->nr_frags)
+		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+	skb_shinfo(skb)->nr_frags = to;
+
+	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+	/* Most likely the tgt won't ever need its checksum anymore, skb on
+	 * the other hand might need it if it needs to be resent
+	 */
+	tgt->ip_summed = CHECKSUM_PARTIAL;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	/* Yak, is it really working this way? Some helper please? */
+	skb->len -= shiftlen;
+	skb->data_len -= shiftlen;
+	skb->truesize -= shiftlen;
+	tgt->len += shiftlen;
+	tgt->data_len += shiftlen;
+	tgt->truesize += shiftlen;
+
+	return shiftlen;
+}
+
 /**
  * skb_prepare_seq_read - Prepare a sequential read of skb data
  * @skb: the buffer to read
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3c8e297e2c3..97d57676b8e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
  * aligned portion of it that matches. Therefore we might need to fragment
  * which may fail and creates some hassle (caller must handle error case
  * returns).
+ *
+ * FIXME: this could be merged to shift decision code
  */
 static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 				 u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 
 		if (fack_count > tp->fackets_out)
 			tp->fackets_out = fack_count;
-
-		if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-			tcp_advance_highest_sack(sk, skb);
 	}
 
 	/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
 	return flag;
 }
 
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+			   struct sk_buff *skb, unsigned int pcount,
+			   int shifted, int fack_count, int *reord,
+			   int *flag, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;	/* We discard results */
+
+	BUG_ON(!pcount);
+
+	TCP_SKB_CB(prev)->end_seq += shifted;
+	TCP_SKB_CB(skb)->seq += shifted;
+
+	skb_shinfo(prev)->gso_segs += pcount;
+	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+	skb_shinfo(skb)->gso_segs -= pcount;
+
+	/* When we're adding to gso_segs == 1, gso_size will be zero,
+	 * in theory this shouldn't be necessary but as long as DSACK
+	 * code can come after this skb later on it's better to keep
+	 * setting gso_size to something.
+	 */
+	if (!skb_shinfo(prev)->gso_size) {
+		skb_shinfo(prev)->gso_size = mss;
+		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+	}
+
+	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
+	if (skb_shinfo(skb)->gso_segs <= 1) {
+		skb_shinfo(skb)->gso_size = 0;
+		skb_shinfo(skb)->gso_type = 0;
+	}
+
+	*flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+				 pcount);
+
+	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
+	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+	tcp_clear_all_retrans_hints(tp);
+
+	if (skb->len > 0) {
+		BUG_ON(!tcp_skb_pcount(skb));
+		return 0;
+	}
+
+	/* Whole SKB was eaten :-) */
+
+	TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(prev)->flags;
+	if (skb == tcp_highest_sack(sk))
+		tcp_advance_highest_sack(sk, skb);
+
+	tcp_unlink_write_queue(skb, sk);
+	sk_wmem_free_skb(sk, skb);
+
+	return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+	int mss = tcp_skb_mss(skb);
+
+	if (!mss)
+		mss = skb->len;
+
+	return mss;
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+	return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+					  u32 start_seq, u32 end_seq,
+					  int dup_sack, int *fack_count,
+					  int *reord, int *flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *prev;
+	int mss;
+	int pcount = 0;
+	int len;
+	int in_sack;
+
+	if (!sk_can_gso(sk))
+		goto fallback;
+
+	/* Normally R but no L won't result in plain S */
+	if (!dup_sack &&
+	    (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+		goto fallback;
+	if (!skb_can_shift(skb))
+		goto fallback;
+	/* This frame is about to be dropped (was ACKed). */
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+		goto fallback;
+
+	/* Can only happen with delayed DSACK + discard craziness */
+	if (unlikely(skb == tcp_write_queue_head(sk)))
+		goto fallback;
+	prev = tcp_write_queue_prev(sk, skb);
+
+	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto fallback;
+
+	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+	if (in_sack) {
+		len = skb->len;
+		pcount = tcp_skb_pcount(skb);
+		mss = tcp_shift_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+	} else {
+		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+			goto noop;
+		/* CHECKME: This is non-MSS split case only?, this will
+		 * cause skipped skbs due to advancing loop btw, original
+		 * has that feature too
+		 */
+		if (tcp_skb_pcount(skb) <= 1)
+			goto noop;
+
+		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+		if (!in_sack) {
+			/* TODO: head merge to next could be attempted here
+			 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+			 * though it might not be worth of the additional hassle
+			 *
+			 * ...we can probably just fallback to what was done
+			 * previously. We could try merging non-SACKed ones
+			 * as well but it probably isn't going to buy off
+			 * because later SACKs might again split them, and
+			 * it would make skb timestamp tracking considerably
+			 * harder problem.
+			 */
+			goto fallback;
+		}
+
+		len = end_seq - TCP_SKB_CB(skb)->seq;
+		BUG_ON(len < 0);
+		BUG_ON(len > skb->len);
+
+		/* MSS boundaries should be honoured or else pcount will
+		 * severely break even though it makes things bit trickier.
+		 * Optimize common case to avoid most of the divides
+		 */
+		mss = tcp_skb_mss(skb);
+
+		/* TODO: Fix DSACKs to not fragment already SACKed and we can
+		 * drop this restriction as unnecessary
+		 */
+		if (mss != tcp_shift_mss(prev))
+			goto fallback;
+
+		if (len == mss) {
+			pcount = 1;
+		} else if (len < mss) {
+			goto noop;
+		} else {
+			pcount = len / mss;
+			len = pcount * mss;
+		}
+	}
+
+	if (!skb_shift(prev, skb, len))
+		goto fallback;
+	if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+			     flag, mss))
+		goto out;
+
+	/* Hole filled allows collapsing with the next as well, this is very
+	 * useful when hole on every nth skb pattern happens
+	 */
+	if (prev == tcp_write_queue_tail(sk))
+		goto out;
+	skb = tcp_write_queue_next(sk, prev);
+
+	if (!skb_can_shift(skb))
+		goto out;
+	if (skb == tcp_send_head(sk))
+		goto out;
+	if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+		goto out;
+
+	len = skb->len;
+	if (skb_shift(prev, skb, len)) {
+		pcount += tcp_skb_pcount(skb);
+		tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+				*fack_count, reord, flag, mss);
+	}
+
+out:
+	*fack_count += pcount;
+	return prev;
+
+noop:
+	return skb;
+
+fallback:
+	return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 					struct tcp_sack_block *next_dup,
 					u32 start_seq, u32 end_seq,
 					int dup_sack_in, int *fack_count,
 					int *reord, int *flag)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *tmp;
+
 	tcp_for_write_queue_from(skb, sk) {
 		int in_sack = 0;
 		int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
 				dup_sack = 1;
 		}
 
-		if (in_sack <= 0)
-			in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-							end_seq);
+		/* skb reference here is a bit tricky to get right, since
+		 * shifting can eat and free both this skb and the next,
+		 * so not even _safe variant of the loop is enough.
+		 */
+		if (in_sack <= 0) {
+			tmp = tcp_shift_skb_data(sk, skb, start_seq,
+						 end_seq, dup_sack,
+						 fack_count, reord, flag);
+			if (tmp != NULL) {
+				if (tmp != skb) {
+					skb = tmp;
+					continue;
+				}
+
+				in_sack = 0;
+			} else {
+				in_sack = tcp_match_skb_to_sack(sk, skb,
+								start_seq,
+								end_seq);
+			}
+		}
+
 		if (unlikely(in_sack < 0))
 			break;
 
-		if (in_sack)
+		if (in_sack) {
 			*flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
 						 *fack_count,
 						 &(TCP_SKB_CB(skb)->sacked),
 						 tcp_skb_pcount(skb));
 
+			if (!before(TCP_SKB_CB(skb)->seq,
+				    tcp_highest_sack_seq(tp)))
+				tcp_advance_highest_sack(sk, skb);
+		}
+
 		*fack_count += tcp_skb_pcount(skb);
 	}
 	return skb;
-- 
cgit v1.2.3-70-g09d2


From 111cc8b913b42ef07793648b1699288332f273e1 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Mon, 24 Nov 2008 21:27:22 -0800
Subject: tcp: add some mibs to track collapsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h | 3 +++
 net/ipv4/proc.c      | 3 +++
 net/ipv4/tcp_input.c | 4 ++++
 3 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 7a6e6bba4a7..aee3f1e1d1c 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -216,6 +216,9 @@ enum
 	LINUX_MIB_TCPSPURIOUSRTOS,		/* TCPSpuriousRTOs */
 	LINUX_MIB_TCPMD5NOTFOUND,		/* TCPMD5NotFound */
 	LINUX_MIB_TCPMD5UNEXPECTED,		/* TCPMD5Unexpected */
+	LINUX_MIB_SACKSHIFTED,
+	LINUX_MIB_SACKMERGED,
+	LINUX_MIB_SACKSHIFTFALLBACK,
 	__LINUX_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index a631a1f110c..731789bb499 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -234,6 +234,9 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
 	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
 	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
+	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
+	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
+	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e6291dde334..9f8a80ba17b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1415,6 +1415,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 
 	if (skb->len > 0) {
 		BUG_ON(!tcp_skb_pcount(skb));
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
 		return 0;
 	}
 
@@ -1436,6 +1437,8 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	tcp_unlink_write_queue(skb, sk);
 	sk_wmem_free_skb(sk, skb);
 
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+
 	return 1;
 }
 
@@ -1594,6 +1597,7 @@ noop:
 	return skb;
 
 fallback:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
 	return NULL;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 14bfc987e395797dfe03e915e8b4c7fc9e5078e4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 25 Nov 2008 08:58:11 +0100
Subject: tracing, tty: fix warnings caused by branch tracing and
 tty_kref_get()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stephen Rothwell reported tht this warning started triggering in
linux-next:

  In file included from init/main.c:27:
  include/linux/tty.h: In function ‘tty_kref_get’:
  include/linux/tty.h:330: warning: ‘______f’ is static but declared in inline function ‘tty_kref_get’ which is not static

Which gcc emits for 'extern inline' functions that nevertheless define
static variables. Change it to 'static inline', which is the norm
in the kernel anyway.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tty.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 3b8121d4e36..eaec37c9d83 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -325,7 +325,7 @@ extern struct class *tty_class;
  *	go away
  */
 
-extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
+static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
 	if (tty)
 		kref_get(&tty->kref);
-- 
cgit v1.2.3-70-g09d2


From 47fd5b8373ecc6bf5473e4139b62b06425448252 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 25 Nov 2008 00:20:43 -0800
Subject: netdev: add HAVE_NET_DEVICE_OPS

As a concession to vendors who have to deal with one source for different
kernel versions, add a HAVE_NET_DEVICE_OPS so they don't end up hard
coding ifdef against kernel version.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6095af572df..76a89f8e6a1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -545,6 +545,7 @@ struct netdev_queue {
  *
  * void (*ndo_poll_controller)(struct net_device *dev);
  */
+#define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
 	void			(*ndo_uninit)(struct net_device *dev);
-- 
cgit v1.2.3-70-g09d2


From 7a6b6f515f77d1c62a2f383b6dce18cb0af0cf4f Mon Sep 17 00:00:00 2001
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Tue, 25 Nov 2008 01:02:08 -0800
Subject: DCB: fix kconfig option

Since the netlink option for DCB is necessary to actually be useful,
simplified the Kconfig option.  In addition, added useful help text for the
Kconfig option.

Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig            |  4 ++--
 drivers/net/ixgbe/Makefile     |  2 +-
 drivers/net/ixgbe/ixgbe.h      |  2 +-
 drivers/net/ixgbe/ixgbe_main.c | 10 +++++-----
 include/linux/netdevice.h      |  4 ++--
 net/Makefile                   |  4 ++--
 net/dcb/Kconfig                | 24 +++++++++++++++++-------
 net/dcb/dcbnl.c                |  2 +-
 8 files changed, 31 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index efd461d7c2b..75f9f7f2fbe 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2451,10 +2451,10 @@ config IXGBE_DCA
 	  driver.  DCA is a method for warming the CPU cache before data
 	  is used, with the intent of lessening the impact of cache misses.
 
-config IXGBE_DCBNL
+config IXGBE_DCB
 	bool "Data Center Bridging (DCB) Support"
 	default n
-	depends on IXGBE && DCBNL
+	depends on IXGBE && DCB
 	---help---
 	  Say Y here if you want to use Data Center Bridging (DCB) in the
 	  driver.
diff --git a/drivers/net/ixgbe/Makefile b/drivers/net/ixgbe/Makefile
index 3228e508e62..6e7ef765bcd 100644
--- a/drivers/net/ixgbe/Makefile
+++ b/drivers/net/ixgbe/Makefile
@@ -35,4 +35,4 @@ obj-$(CONFIG_IXGBE) += ixgbe.o
 ixgbe-objs := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \
               ixgbe_82598.o ixgbe_phy.o
 
-ixgbe-$(CONFIG_IXGBE_DCBNL) +=  ixgbe_dcb.o ixgbe_dcb_82598.o ixgbe_dcb_nl.o
+ixgbe-$(CONFIG_IXGBE_DCB) +=  ixgbe_dcb.o ixgbe_dcb_82598.o ixgbe_dcb_nl.o
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 9509aee2d36..6cbf26e3db8 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -327,7 +327,7 @@ enum ixgbe_boards {
 };
 
 extern struct ixgbe_info ixgbe_82598_info;
-#ifdef CONFIG_IXGBE_DCBNL
+#ifdef CONFIG_IXGBE_DCB
 extern struct dcbnl_rtnl_ops dcbnl_ops;
 extern int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
                               struct ixgbe_dcb_config *dst_dcb_cfg,
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 6620397a1fb..667a6463193 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1914,7 +1914,7 @@ static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter)
 	}
 }
 
-#ifdef CONFIG_IXGBE_DCBNL
+#ifdef CONFIG_IXGBE_DCB
 /*
  * ixgbe_configure_dcb - Configure DCB hardware
  * @adapter: ixgbe adapter struct
@@ -1960,7 +1960,7 @@ static void ixgbe_configure(struct ixgbe_adapter *adapter)
 	ixgbe_set_rx_mode(netdev);
 
 	ixgbe_restore_vlan(adapter);
-#ifdef CONFIG_IXGBE_DCBNL
+#ifdef CONFIG_IXGBE_DCB
 	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
 		netif_set_gso_max_size(netdev, 32768);
 		ixgbe_configure_dcb(adapter);
@@ -2749,7 +2749,7 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct pci_dev *pdev = adapter->pdev;
 	unsigned int rss;
-#ifdef CONFIG_IXGBE_DCBNL
+#ifdef CONFIG_IXGBE_DCB
 	int j;
 	struct tc_configuration *tc;
 #endif
@@ -2768,7 +2768,7 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
 	adapter->flags |= IXGBE_FLAG_RSS_ENABLED;
 	adapter->ring_feature[RING_F_DCB].indices = IXGBE_MAX_DCB_INDICES;
 
-#ifdef CONFIG_IXGBE_DCBNL
+#ifdef CONFIG_IXGBE_DCB
 	/* Configure DCB traffic classes */
 	for (j = 0; j < MAX_TRAFFIC_CLASS; j++) {
 		tc = &adapter->dcb_cfg.tc_config[j];
@@ -4120,7 +4120,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
 		adapter->flags &= ~IXGBE_FLAG_RSS_ENABLED;
 
-#ifdef CONFIG_IXGBE_DCBNL
+#ifdef CONFIG_IXGBE_DCB
 	netdev->dcbnl_ops = &dcbnl_ops;
 #endif
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 76a89f8e6a1..0df0db068ac 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -43,7 +43,7 @@
 
 #include <net/net_namespace.h>
 #include <net/dsa.h>
-#ifdef CONFIG_DCBNL
+#ifdef CONFIG_DCB
 #include <net/dcbnl.h>
 #endif
 
@@ -847,7 +847,7 @@ struct net_device
 #define GSO_MAX_SIZE		65536
 	unsigned int		gso_max_size;
 
-#ifdef CONFIG_DCBNL
+#ifdef CONFIG_DCB
 	/* Data Center Bridging netlink ops */
 	struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
diff --git a/net/Makefile b/net/Makefile
index e5af3dc3a03..ba4460432b7 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -56,8 +56,8 @@ obj-$(CONFIG_NETLABEL)		+= netlabel/
 obj-$(CONFIG_IUCV)		+= iucv/
 obj-$(CONFIG_RFKILL)		+= rfkill/
 obj-$(CONFIG_NET_9P)		+= 9p/
-ifeq ($(CONFIG_DCBNL),y)
-obj-$(CONFIG_DCB)		+= dcb/
+ifneq ($(CONFIG_DCB),)
+obj-y				+= dcb/
 endif
 
 ifeq ($(CONFIG_NET),y)
diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig
index bdf38802d33..4066d59c8de 100644
--- a/net/dcb/Kconfig
+++ b/net/dcb/Kconfig
@@ -1,12 +1,22 @@
 config DCB
-        tristate "Data Center Bridging support"
-
-config DCBNL
-	bool "Data Center Bridging netlink interface support"
-	depends on DCB
+	bool "Data Center Bridging support"
 	default n
 	---help---
-	  This option turns on the netlink interface
-	  (dcbnl) for Data Center Bridging capable devices.
+	  This enables support for configuring Data Center Bridging (DCB)
+	  features on DCB capable Ethernet adapters via rtnetlink.  Say 'Y'
+	  if you have a DCB capable Ethernet adapter which supports this
+	  interface and you are connected to a DCB capable switch.
+
+	  DCB is a collection of Ethernet enhancements which allow DCB capable
+	  NICs and switches to support network traffic with differing
+	  requirements (highly reliable, no drops vs. best effort vs. low
+	  latency) to co-exist on Ethernet.
+
+	  DCB features include:
+	    Enhanced Transmission Selection (aka Priority Grouping) - provides a
+	      framework for assigning bandwidth guarantees to traffic classes.
+	    Priority-based Flow Control (PFC) - a MAC control pause frame which
+	      works at the granularity of the 802.1p priority instead of the
+	      link (802.3x).
 
 	  If unsure, say N.
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index b2bda3f610d..79a351d323a 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -48,7 +48,7 @@
  */
 
 MODULE_AUTHOR("Lucy Liu, <lucy.liu@intel.com>");
-MODULE_DESCRIPTION("Data Center Bridging generic netlink interface");
+MODULE_DESCRIPTION("Data Center Bridging netlink interface");
 MODULE_LICENSE("GPL");
 
 /**************** DCB attribute policies *************************************/
-- 
cgit v1.2.3-70-g09d2


From ca109491f612aab5c8152207631c0444f63da97f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Nov 2008 12:43:51 +0100
Subject: hrtimer: removing all ur callback modes

Impact: cleanup, move all hrtimer processing into hardirq context

This is an attempt at removing some of the hrtimer complexity by
reducing the number of callback modes to 1.

This means that all hrtimer callback functions will be ran from HARD-irq
context.

I went through all the 30 odd hrtimer callback functions in the kernel
and saw only one that I'm not quite sure of, which is the one in
net/can/bcm.c - hence I'm CC-ing the folks responsible for that code.

Furthermore, the hrtimer core now calls callbacks directly with IRQs
disabled in case you try to enqueue an expired timer. If this timer is a
periodic timer (which should use hrtimer_forward() to advance its time)
then it might be possible to end up in an inf. recursive loop due to the
fact that hrtimer_forward() doesn't round up to the next timer
granularity, and therefore keeps on calling the callback - obviously
this needs a fix.

Aside from that, this seems to compile and actually boot on my dual core
test box - although I'm sure there are some bugs in, me not hitting any
makes me certain :-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/input/touchscreen/ads7846.c |   4 +-
 include/linux/hrtimer.h             |  34 +----
 include/linux/interrupt.h           |   3 -
 kernel/hrtimer.c                    | 280 ++++--------------------------------
 kernel/sched.c                      |   2 -
 kernel/time/ntp.c                   |   4 +-
 kernel/time/tick-sched.c            |   1 -
 kernel/trace/trace_sysprof.c        |   1 -
 sound/drivers/pcsp/pcsp.c           |   1 -
 9 files changed, 37 insertions(+), 293 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index b9b7fc6ff1e..e1ece89fe92 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -697,7 +697,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle)
 	struct ads7846	*ts = container_of(handle, struct ads7846, timer);
 	int		status = 0;
 
-	spin_lock_irq(&ts->lock);
+	spin_lock(&ts->lock);
 
 	if (unlikely(!get_pendown_state(ts) ||
 		     device_suspended(&ts->spi->dev))) {
@@ -728,7 +728,7 @@ static enum hrtimer_restart ads7846_timer(struct hrtimer *handle)
 			dev_err(&ts->spi->dev, "spi_async --> %d\n", status);
 	}
 
-	spin_unlock_irq(&ts->lock);
+	spin_unlock(&ts->lock);
 	return HRTIMER_NORESTART;
 }
 
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3eba43878dc..bd37078c2d7 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -42,26 +42,6 @@ enum hrtimer_restart {
 	HRTIMER_RESTART,	/* Timer must be restarted */
 };
 
-/*
- * hrtimer callback modes:
- *
- *	HRTIMER_CB_SOFTIRQ:		Callback must run in softirq context
- *	HRTIMER_CB_IRQSAFE_PERCPU:	Callback must run in hardirq context
- *					Special mode for tick emulation and
- *					scheduler timer. Such timers are per
- *					cpu and not allowed to be migrated on
- *					cpu unplug.
- *	HRTIMER_CB_IRQSAFE_UNLOCKED:	Callback should run in hardirq context
- *					with timer->base lock unlocked
- *					used for timers which call wakeup to
- *					avoid lock order problems with rq->lock
- */
-enum hrtimer_cb_mode {
-	HRTIMER_CB_SOFTIRQ,
-	HRTIMER_CB_IRQSAFE_PERCPU,
-	HRTIMER_CB_IRQSAFE_UNLOCKED,
-};
-
 /*
  * Values to track state of the timer
  *
@@ -70,7 +50,6 @@ enum hrtimer_cb_mode {
  * 0x00		inactive
  * 0x01		enqueued into rbtree
  * 0x02		callback function running
- * 0x04		callback pending (high resolution mode)
  *
  * Special cases:
  * 0x03		callback function running and enqueued
@@ -92,8 +71,7 @@ enum hrtimer_cb_mode {
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_PENDING	0x04
-#define HRTIMER_STATE_MIGRATE	0x08
+#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -109,8 +87,6 @@ enum hrtimer_cb_mode {
  * @function:	timer expiry callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  * @state:	state information (See bit values above)
- * @cb_mode:	high resolution timer feature to select the callback execution
- *		 mode
  * @cb_entry:	list head to enqueue an expired timer into the callback list
  * @start_site:	timer statistics field to store the site where the timer
  *		was started
@@ -129,7 +105,6 @@ struct hrtimer {
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
 	struct list_head		cb_entry;
-	enum hrtimer_cb_mode		cb_mode;
 #ifdef CONFIG_TIMER_STATS
 	int				start_pid;
 	void				*start_site;
@@ -188,15 +163,11 @@ struct hrtimer_clock_base {
  * @check_clocks:	Indictator, when set evaluate time source and clock
  *			event devices whether high resolution mode can be
  *			activated.
- * @cb_pending:		Expired timers are moved from the rbtree to this
- *			list in the timer interrupt. The list is processed
- *			in the softirq.
  * @nr_events:		Total number of timer interrupt events
  */
 struct hrtimer_cpu_base {
 	spinlock_t			lock;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
-	struct list_head		cb_pending;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t				expires_next;
 	int				hres_active;
@@ -404,8 +375,7 @@ static inline int hrtimer_active(const struct hrtimer *timer)
  */
 static inline int hrtimer_is_queued(struct hrtimer *timer)
 {
-	return timer->state &
-		(HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING);
+	return timer->state & HRTIMER_STATE_ENQUEUED;
 }
 
 /*
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f58a0cf8929..d6210a97a8c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -251,9 +251,6 @@ enum
 	BLOCK_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
-#ifdef CONFIG_HIGH_RES_TIMERS
-	HRTIMER_SOFTIRQ,
-#endif
 	RCU_SOFTIRQ, 	/* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b..efd6f41e1c1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -442,22 +442,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
 static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
 #endif
 
-/*
- * Check, whether the timer is on the callback pending list
- */
-static inline int hrtimer_cb_pending(const struct hrtimer *timer)
-{
-	return timer->state & HRTIMER_STATE_PENDING;
-}
-
-/*
- * Remove a timer from the callback pending list
- */
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
-{
-	list_del_init(&timer->cb_entry);
-}
-
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -651,6 +635,8 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 {
 }
 
+static void __run_hrtimer(struct hrtimer *timer);
+
 /*
  * When High resolution timers are active, try to reprogram. Note, that in case
  * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,31 +647,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 					    struct hrtimer_clock_base *base)
 {
 	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
-
-		/* Timer is expired, act upon the callback mode */
-		switch(timer->cb_mode) {
-		case HRTIMER_CB_IRQSAFE_PERCPU:
-		case HRTIMER_CB_IRQSAFE_UNLOCKED:
-			/*
-			 * This is solely for the sched tick emulation with
-			 * dynamic tick support to ensure that we do not
-			 * restart the tick right on the edge and end up with
-			 * the tick timer in the softirq ! The calling site
-			 * takes care of this. Also used for hrtimer sleeper !
-			 */
-			debug_hrtimer_deactivate(timer);
-			return 1;
-		case HRTIMER_CB_SOFTIRQ:
-			/*
-			 * Move everything else into the softirq pending list !
-			 */
-			list_add_tail(&timer->cb_entry,
-				      &base->cpu_base->cb_pending);
-			timer->state = HRTIMER_STATE_PENDING;
-			return 1;
-		default:
-			BUG();
-		}
+		/*
+		 * XXX: recursion check?
+		 * hrtimer_forward() should round up with timer granularity
+		 * so that we never get into inf recursion here,
+		 * it doesn't do that though
+		 */
+		__run_hrtimer(timer);
+		return 1;
 	}
 	return 0;
 }
@@ -724,11 +693,6 @@ static int hrtimer_switch_to_hres(void)
 	return 1;
 }
 
-static inline void hrtimer_raise_softirq(void)
-{
-	raise_softirq(HRTIMER_SOFTIRQ);
-}
-
 #else
 
 static inline int hrtimer_hres_active(void) { return 0; }
@@ -747,7 +711,6 @@ static inline int hrtimer_reprogram(struct hrtimer *timer,
 {
 	return 0;
 }
-static inline void hrtimer_raise_softirq(void) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -890,10 +853,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
 			     unsigned long newstate, int reprogram)
 {
-	/* High res. callback list. NOP for !HIGHRES */
-	if (hrtimer_cb_pending(timer))
-		hrtimer_remove_cb_pending(timer);
-	else {
+	if (timer->state & HRTIMER_STATE_ENQUEUED) {
 		/*
 		 * Remove the timer from the rbtree and replace the
 		 * first entry pointer if necessary.
@@ -953,7 +913,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret, raise;
+	int ret;
 
 	base = lock_hrtimer_base(timer, &flags);
 
@@ -988,26 +948,8 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 	enqueue_hrtimer(timer, new_base,
 			new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
 
-	/*
-	 * The timer may be expired and moved to the cb_pending
-	 * list. We can not raise the softirq with base lock held due
-	 * to a possible deadlock with runqueue lock.
-	 */
-	raise = timer->state == HRTIMER_STATE_PENDING;
-
-	/*
-	 * We use preempt_disable to prevent this task from migrating after
-	 * setting up the softirq and raising it. Otherwise, if me migrate
-	 * we will raise the softirq on the wrong CPU.
-	 */
-	preempt_disable();
-
 	unlock_hrtimer_base(timer, &flags);
 
-	if (raise)
-		hrtimer_raise_softirq();
-	preempt_enable();
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1192,75 +1134,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
-static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
-{
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-		int emulate_hardirq_ctx = 0;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
-
-		debug_hrtimer_deactivate(timer);
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		/*
-		 * A timer might have been added to the cb_pending list
-		 * when it was migrated during a cpu-offline operation.
-		 * Emulate hardirq context for such timers.
-		 */
-		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
-		    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
-			emulate_hardirq_ctx = 1;
-
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		if (unlikely(emulate_hardirq_ctx)) {
-			local_irq_disable();
-			restart = fn(timer);
-			local_irq_enable();
-		} else
-			restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			struct hrtimer_clock_base *base = timer->base;
-
-			if (base->first == &timer->node &&
-			    hrtimer_reprogram(timer, base)) {
-				/*
-				 * Timer is expired. Thus move it from tree to
-				 * pending list again.
-				 */
-				__remove_hrtimer(timer, base,
-						 HRTIMER_STATE_PENDING, 0);
-				list_add_tail(&timer->cb_entry,
-					      &base->cpu_base->cb_pending);
-			}
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
-}
-
 static void __run_hrtimer(struct hrtimer *timer)
 {
 	struct hrtimer_clock_base *base = timer->base;
@@ -1268,25 +1141,21 @@ static void __run_hrtimer(struct hrtimer *timer)
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
+	WARN_ON(!irqs_disabled());
+
 	debug_hrtimer_deactivate(timer);
 	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
 	timer_stats_account_hrtimer(timer);
-
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
-	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
-		/*
-		 * Used for scheduler timers, avoid lock inversion with
-		 * rq->lock and tasklist_lock.
-		 *
-		 * These timers are required to deal with enqueue expiry
-		 * themselves and are not allowed to migrate.
-		 */
-		spin_unlock(&cpu_base->lock);
-		restart = fn(timer);
-		spin_lock(&cpu_base->lock);
-	} else
-		restart = fn(timer);
+
+	/*
+	 * Because we run timers from hardirq context, there is no chance
+	 * they get migrated to another cpu, therefore its safe to unlock
+	 * the timer base.
+	 */
+	spin_unlock(&cpu_base->lock);
+	restart = fn(timer);
+	spin_lock(&cpu_base->lock);
 
 	/*
 	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
@@ -1311,7 +1180,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 	struct hrtimer_clock_base *base;
 	ktime_t expires_next, now;
-	int i, raise = 0;
+	int i;
 
 	BUG_ON(!cpu_base->hres_active);
 	cpu_base->nr_events++;
@@ -1360,16 +1229,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 				break;
 			}
 
-			/* Move softirq callbacks to the pending list */
-			if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
-				__remove_hrtimer(timer, base,
-						 HRTIMER_STATE_PENDING, 0);
-				list_add_tail(&timer->cb_entry,
-					      &base->cpu_base->cb_pending);
-				raise = 1;
-				continue;
-			}
-
 			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
@@ -1383,10 +1242,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 		if (tick_program_event(expires_next, 0))
 			goto retry;
 	}
-
-	/* Raise softirq ? */
-	if (raise)
-		raise_softirq(HRTIMER_SOFTIRQ);
 }
 
 /**
@@ -1413,11 +1268,6 @@ void hrtimer_peek_ahead_timers(void)
 	local_irq_restore(flags);
 }
 
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
-	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
-}
-
 #endif	/* CONFIG_HIGH_RES_TIMERS */
 
 /*
@@ -1429,8 +1279,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
  */
 void hrtimer_run_pending(void)
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
 	if (hrtimer_hres_active())
 		return;
 
@@ -1444,8 +1292,6 @@ void hrtimer_run_pending(void)
 	 */
 	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
 		hrtimer_switch_to_hres();
-
-	run_hrtimer_pending(cpu_base);
 }
 
 /*
@@ -1482,14 +1328,6 @@ void hrtimer_run_queues(void)
 					hrtimer_get_expires_tv64(timer))
 				break;
 
-			if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
-				__remove_hrtimer(timer, base,
-					HRTIMER_STATE_PENDING, 0);
-				list_add_tail(&timer->cb_entry,
-					&base->cpu_base->cb_pending);
-				continue;
-			}
-
 			__run_hrtimer(timer);
 		}
 		spin_unlock(&cpu_base->lock);
@@ -1516,9 +1354,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 {
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
-#ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
-#endif
 }
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1655,36 +1490,22 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 
-	INIT_LIST_HEAD(&cpu_base->cb_pending);
 	hrtimer_init_hres(cpu_base);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base, int dcpu)
+static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+				 struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
-	int raise = 0;
 
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
 
-		/*
-		 * Should not happen. Per CPU timers should be
-		 * canceled _before_ the migration code is called
-		 */
-		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
-			__remove_hrtimer(timer, old_base,
-					 HRTIMER_STATE_INACTIVE, 0);
-			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
-			     timer, timer->function, dcpu);
-			continue;
-		}
-
 		/*
 		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
@@ -1708,48 +1529,19 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * otherwise we end up with a stale timer.
 		 */
 		if (timer->state == HRTIMER_STATE_MIGRATE) {
-			timer->state = HRTIMER_STATE_PENDING;
-			list_add_tail(&timer->cb_entry,
-				      &new_base->cpu_base->cb_pending);
-			raise = 1;
+			/* XXX: running on offline cpu */
+			__run_hrtimer(timer);
 		}
 #endif
 		/* Clear the migration state bit */
 		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
-	return raise;
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
-				   struct hrtimer_cpu_base *new_base)
-{
-	struct hrtimer *timer;
-	int raise = 0;
-
-	while (!list_empty(&old_base->cb_pending)) {
-		timer = list_entry(old_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
-
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
-		timer->base = &new_base->clock_base[timer->base->index];
-		list_add_tail(&timer->cb_entry, &new_base->cb_pending);
-		raise = 1;
-	}
-	return raise;
-}
-#else
-static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
-				   struct hrtimer_cpu_base *new_base)
-{
-	return 0;
-}
-#endif
-
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i, raise = 0;
+	int i;
 
 	BUG_ON(cpu_online(cpu));
 	old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1764,20 +1556,13 @@ static void migrate_hrtimers(int cpu)
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		if (migrate_hrtimer_list(&old_base->clock_base[i],
-					 &new_base->clock_base[i], cpu))
-			raise = 1;
+		migrate_hrtimer_list(&old_base->clock_base[i],
+				     &new_base->clock_base[i], cpu);
 	}
 
-	if (migrate_hrtimer_pending(old_base, new_base))
-		raise = 1;
-
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
 	put_cpu_var(hrtimer_bases);
-
-	if (raise)
-		hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
@@ -1817,9 +1602,6 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
 }
 
 /**
diff --git a/kernel/sched.c b/kernel/sched.c
index 9b1e79371c2..5ac5e953616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -203,7 +203,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static inline int rt_bandwidth_enabled(void)
@@ -1139,7 +1138,6 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486..f5f793d9241 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 {
 	enum hrtimer_restart res = HRTIMER_NORESTART;
 
-	write_seqlock_irq(&xtime_lock);
+	write_seqlock(&xtime_lock);
 
 	switch (time_state) {
 	case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 	}
 	update_vsyscall(&xtime, clock);
 
-	write_sequnlock_irq(&xtime_lock);
+	write_sequnlock(&xtime_lock);
 
 	return res;
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab4..502a81e2639 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -681,7 +681,6 @@ void tick_setup_sched_timer(void)
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
-	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	/* Get the next period (per cpu) */
 	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba5..ae542e2e38d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,6 @@ static void start_stack_timer(int cpu)
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
-	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }
diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c
index 1899cf0685b..8e52b2a8a13 100644
--- a/sound/drivers/pcsp/pcsp.c
+++ b/sound/drivers/pcsp/pcsp.c
@@ -96,7 +96,6 @@ static int __devinit snd_card_pcsp_probe(int devnum, struct device *dev)
 		return -EINVAL;
 
 	hrtimer_init(&pcsp_chip.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	pcsp_chip.timer.cb_mode = HRTIMER_CB_SOFTIRQ;
 	pcsp_chip.timer.function = pcsp_do_timer;
 
 	card = snd_card_new(index, id, THIS_MODULE, 0);
-- 
cgit v1.2.3-70-g09d2


From ca0002a179bfa532d009a9272d619732872c49bd Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 25 Nov 2008 09:01:25 +0100
Subject: x86, bts: base in-kernel ds interface on handles

Impact: generalize the DS code to shared buffers

Change the in-kernel ds.h interface to identify the tracer via a
handle returned on ds_request_~().

Tracers used to be identified via their task_struct.

The changes are required to allow DS to be shared between different
tasks, which is needed for perfmon2 and for ftrace.

For ptrace, the handle is stored in the traced task's task_struct.
This should probably go into a (arch-specific) ptrace context some
time.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h | 124 ++++-----
 arch/x86/kernel/ds.c      | 679 +++++++++++++++++++++++-----------------------
 arch/x86/kernel/ptrace.c  |  73 ++---
 include/linux/sched.h     |   9 +
 4 files changed, 446 insertions(+), 439 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a95008457ea..0af997de5f0 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -26,11 +26,18 @@
 
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 
 #ifdef CONFIG_X86_DS
 
 struct task_struct;
+struct ds_tracer;
+struct bts_tracer;
+struct pebs_tracer;
+
+typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
+typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
 
 /*
  * Request BTS or PEBS
@@ -38,21 +45,29 @@ struct task_struct;
  * Due to alignement constraints, the actual buffer may be slightly
  * smaller than the requested or provided buffer.
  *
- * Returns 0 on success; -Eerrno otherwise
+ * Returns a pointer to a tracer structure on success, or
+ * ERR_PTR(errcode) on failure.
+ *
+ * The interrupt threshold is independent from the overflow callback
+ * to allow users to use their own overflow interrupt handling mechanism.
  *
  * task: the task to request recording for;
  *       NULL for per-cpu recording on the current cpu
  * base: the base pointer for the (non-pageable) buffer;
  *       NULL if buffer allocation requested
- * size: the size of the requested or provided buffer
+ * size: the size of the requested or provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
  *       NULL if cyclic buffer requested
+ * th: the interrupt threshold in records from the end of the buffer;
+ *     -1 if no interrupt threshold is requested.
  */
-typedef void (*ds_ovfl_callback_t)(struct task_struct *);
-extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
-			  ds_ovfl_callback_t ovfl);
-extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-			   ds_ovfl_callback_t ovfl);
+extern struct bts_tracer *ds_request_bts(struct task_struct *task,
+					 void *base, size_t size,
+					 bts_ovfl_callback_t ovfl, size_t th);
+extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+					   void *base, size_t size,
+					   pebs_ovfl_callback_t ovfl,
+					   size_t th);
 
 /*
  * Release BTS or PEBS resources
@@ -61,37 +76,34 @@ extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
  *
  * Returns 0 on success; -Eerrno otherwise
  *
- * task: the task to release resources for;
- *       NULL to release resources for the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_release_bts(struct task_struct *task);
-extern int ds_release_pebs(struct task_struct *task);
+extern int ds_release_bts(struct bts_tracer *tracer);
+extern int ds_release_pebs(struct pebs_tracer *tracer);
 
 /*
- * Return the (array) index of the write pointer.
+ * Get the (array) index of the write pointer.
  * (assuming an array of BTS/PEBS records)
  *
- * Returns -Eerrno on error
+ * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * tracer: the tracer handle returned from ds_request_~()
+ * pos (out): will hold the result
  */
-extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
+extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos);
+extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);
 
 /*
- * Return the (array) index one record beyond the end of the array.
+ * Get the (array) index one record beyond the end of the array.
  * (assuming an array of BTS/PEBS records)
  *
- * Returns -Eerrno on error
+ * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * tracer: the tracer handle returned from ds_request_~()
+ * pos (out): will hold the result
  */
-extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
+extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos);
+extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);
 
 /*
  * Provide a pointer to the BTS/PEBS record at parameter index.
@@ -102,14 +114,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
  *
  * Returns the size of a single record on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  * index: the index of the requested record
  * record (out): pointer to the requested record
  */
-extern int ds_access_bts(struct task_struct *task,
+extern int ds_access_bts(struct bts_tracer *tracer,
 			 size_t index, const void **record);
-extern int ds_access_pebs(struct task_struct *task,
+extern int ds_access_pebs(struct pebs_tracer *tracer,
 			  size_t index, const void **record);
 
 /*
@@ -129,38 +140,24 @@ extern int ds_access_pebs(struct task_struct *task,
  *
  * Returns the number of bytes written or -Eerrno.
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  * buffer: the buffer to write
  * size: the size of the buffer
  */
-extern int ds_write_bts(struct task_struct *task,
+extern int ds_write_bts(struct bts_tracer *tracer,
 			const void *buffer, size_t size);
-extern int ds_write_pebs(struct task_struct *task,
+extern int ds_write_pebs(struct pebs_tracer *tracer,
 			 const void *buffer, size_t size);
 
-/*
- * Same as ds_write_bts/pebs, but omit ownership checks.
- *
- * This is needed to have some other task than the owner of the
- * BTS/PEBS buffer or the parameter task itself write into the
- * respective buffer.
- */
-extern int ds_unchecked_write_bts(struct task_struct *task,
-				  const void *buffer, size_t size);
-extern int ds_unchecked_write_pebs(struct task_struct *task,
-				   const void *buffer, size_t size);
-
 /*
  * Reset the write pointer of the BTS/PEBS buffer.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_reset_bts(struct task_struct *task);
-extern int ds_reset_pebs(struct task_struct *task);
+extern int ds_reset_bts(struct bts_tracer *tracer);
+extern int ds_reset_pebs(struct pebs_tracer *tracer);
 
 /*
  * Clear the BTS/PEBS buffer and reset the write pointer.
@@ -168,33 +165,30 @@ extern int ds_reset_pebs(struct task_struct *task);
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_clear_bts(struct task_struct *task);
-extern int ds_clear_pebs(struct task_struct *task);
+extern int ds_clear_bts(struct bts_tracer *tracer);
+extern int ds_clear_pebs(struct pebs_tracer *tracer);
 
 /*
  * Provide the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value (out): the counter reset value
  */
-extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
+extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);
 
 /*
  * Set the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
 
 /*
  * Initialization
@@ -207,17 +201,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
 /*
  * The DS context - part of struct thread_struct.
  */
+#define MAX_SIZEOF_DS (12 * 8)
+
 struct ds_context {
 	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-	unsigned char *ds;
+	unsigned char ds[MAX_SIZEOF_DS];
 	/* the owner of the BTS and PEBS configuration, respectively */
-	struct task_struct *owner[2];
-	/* buffer overflow notification function for BTS and PEBS */
-	ds_ovfl_callback_t callback[2];
-	/* the original buffer address */
-	void *buffer[2];
-	/* the number of allocated pages for on-request allocated buffers */
-	unsigned int pages[2];
+	struct ds_tracer  *owner[2];
 	/* use count */
 	unsigned long count;
 	/* a pointer to the context location inside the thread_struct
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index d6938d9351c..96768e9cce9 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/kernel.h>
 
 
 /*
@@ -44,6 +45,35 @@ struct ds_configuration {
 };
 static struct ds_configuration ds_cfg;
 
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+	/* the DS context (partially) owned by this tracer */
+	struct ds_context *context;
+	/* the buffer provided on ds_request() and its size in bytes */
+	void *buffer;
+	size_t size;
+	/* the number of allocated pages for on-request allocated buffers */
+	unsigned int pages;
+};
+
+struct bts_tracer {
+	/* the common DS part */
+	struct ds_tracer ds;
+	/* buffer overflow notification function */
+	bts_ovfl_callback_t ovfl;
+};
+
+struct pebs_tracer {
+	/* the common DS part */
+	struct ds_tracer ds;
+	/* buffer overflow notification function */
+	pebs_ovfl_callback_t ovfl;
+};
 
 /*
  * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -107,35 +137,15 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
 	(*(unsigned long *)base) = value;
 }
 
+#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+
 
 /*
  * Locking is done only for allocating BTS or PEBS resources and for
  * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
  */
 static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
-/*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
- */
-static inline int ds_validate_access(struct ds_context *context,
-				     enum ds_qualifier qual)
-{
-	if (!context)
-		return -EPERM;
-
-	if (context->owner[qual] == current)
-		return 0;
-
-	return -EPERM;
-}
-
 
 /*
  * We either support (system-wide) per-cpu or per-thread allocation.
@@ -183,50 +193,12 @@ static inline int check_tracer(struct task_struct *task)
  *
  * Contexts are use-counted. They are allocated on first access and
  * deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- *   requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- *   non-existing context indicates an error. It acquires and releases
- *   the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
  */
 static DEFINE_PER_CPU(struct ds_context *, system_context);
 
 #define this_system_context per_cpu(system_context, smp_processor_id())
 
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
- */
 static inline struct ds_context *ds_get_context(struct task_struct *task)
-{
-	struct ds_context *context;
-	unsigned long irq;
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	context = (task ? task->thread.ds_ctx : this_system_context);
-	if (context)
-		context->count++;
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-	return context;
-}
-
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
 {
 	struct ds_context **p_context =
 		(task ? &task->thread.ds_ctx : &this_system_context);
@@ -238,16 +210,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
 		if (!context)
 			return NULL;
 
-		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
-		if (!context->ds) {
-			kfree(context);
-			return NULL;
-		}
-
 		spin_lock_irqsave(&ds_lock, irq);
 
 		if (*p_context) {
-			kfree(context->ds);
 			kfree(context);
 
 			context = *p_context;
@@ -272,10 +237,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
 	return context;
 }
 
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
 static inline void ds_put_context(struct ds_context *context)
 {
 	unsigned long irq;
@@ -296,13 +257,6 @@ static inline void ds_put_context(struct ds_context *context)
 	if (!context->task || (context->task == current))
 		wrmsrl(MSR_IA32_DS_AREA, 0);
 
-	put_tracer(context->task);
-
-	/* free any leftover buffers from tracers that did not
-	 * deallocate them properly. */
-	kfree(context->buffer[ds_bts]);
-	kfree(context->buffer[ds_pebs]);
-	kfree(context->ds);
 	kfree(context);
  out:
 	spin_unlock_irqrestore(&ds_lock, irq);
@@ -312,21 +266,29 @@ static inline void ds_put_context(struct ds_context *context)
 /*
  * Handle a buffer overflow
  *
- * task: the task whose buffers are overflowing;
- *       NULL for a buffer overflow on the current cpu
  * context: the ds context
  * qual: the buffer type
  */
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
-			enum ds_qualifier qual)
-{
-	if (!context)
-		return;
-
-	if (context->callback[qual])
-		(*context->callback[qual])(task);
-
-	/* todo: do some more overflow handling */
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
+{
+	switch (qual) {
+	case ds_bts: {
+		struct bts_tracer *tracer =
+			container_of(context->owner[qual],
+				     struct bts_tracer, ds);
+		if (tracer->ovfl)
+			tracer->ovfl(tracer);
+	}
+		break;
+	case ds_pebs: {
+		struct pebs_tracer *tracer =
+			container_of(context->owner[qual],
+				     struct pebs_tracer, ds);
+		if (tracer->ovfl)
+			tracer->ovfl(tracer);
+	}
+		break;
+	}
 }
 
 
@@ -343,23 +305,25 @@ static void ds_overflow(struct task_struct *task, struct ds_context *context,
 static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
 {
 	unsigned long rlim, vm, pgsz;
-	void *buffer;
+	void *buffer = NULL;
 
 	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
+	down_write(&current->mm->mmap_sem);
+
 	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
 	vm   = current->mm->total_vm  + pgsz;
 	if (rlim < vm)
-		return NULL;
+		goto out;
 
 	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
 	vm   = current->mm->locked_vm  + pgsz;
 	if (rlim < vm)
-		return NULL;
+		goto out;
 
 	buffer = kzalloc(size, GFP_KERNEL);
 	if (!buffer)
-		return NULL;
+		goto out;
 
 	current->mm->total_vm  += pgsz;
 	current->mm->locked_vm += pgsz;
@@ -367,290 +331,337 @@ static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
 	if (pages)
 		*pages = pgsz;
 
+ out:
+	up_write(&current->mm->mmap_sem);
 	return buffer;
 }
 
-static int ds_request(struct task_struct *task, void *base, size_t size,
-		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+static void ds_install_ds_config(struct ds_context *context,
+				 enum ds_qualifier qual,
+				 void *base, size_t size, size_t ith)
 {
-	struct ds_context *context;
 	unsigned long buffer, adj;
-	const unsigned long alignment = (1 << 3);
+
+	/* adjust the buffer address and size to meet alignment
+	 * constraints:
+	 * - buffer is double-word aligned
+	 * - size is multiple of record size
+	 *
+	 * We checked the size at the very beginning; we have enough
+	 * space to do the adjustment.
+	 */
+	buffer = (unsigned long)base;
+
+	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
+	buffer += adj;
+	size   -= adj;
+
+	size /= ds_cfg.sizeof_rec[qual];
+	size *= ds_cfg.sizeof_rec[qual];
+
+	ds_set(context->ds, qual, ds_buffer_base, buffer);
+	ds_set(context->ds, qual, ds_index, buffer);
+	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+
+	/* The value for 'no threshold' is -1, which will set the
+	 * threshold outside of the buffer, just like we want it.
+	 */
+	ds_set(context->ds, qual,
+	       ds_interrupt_threshold, buffer + size - ith);
+}
+
+static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
+		      struct task_struct *task,
+		      void *base, size_t size, size_t th)
+{
+	struct ds_context *context;
 	unsigned long irq;
-	int error = 0;
+	int error;
 
+	error = -EOPNOTSUPP;
 	if (!ds_cfg.sizeof_ds)
-		return -EOPNOTSUPP;
+		goto out;
 
 	/* we require some space to do alignment adjustments below */
-	if (size < (alignment + ds_cfg.sizeof_rec[qual]))
-		return -EINVAL;
+	error = -EINVAL;
+	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+		goto out;
 
-	/* buffer overflow notification is not yet implemented */
-	if (ovfl)
-		return -EOPNOTSUPP;
+	if (th != (size_t)-1) {
+		th *= ds_cfg.sizeof_rec[qual];
+
+		error = -EINVAL;
+		if (size <= th)
+			goto out;
+	}
+
+	error = -ENOMEM;
+	if (!base) {
+		base = ds_allocate_buffer(size, &tracer->pages);
+		if (!base)
+			goto out;
+	}
 
+	tracer->buffer = base;
+	tracer->size = size;
 
-	context = ds_alloc_context(task);
+	error = -ENOMEM;
+	context = ds_get_context(task);
 	if (!context)
-		return -ENOMEM;
+		goto out;
+	tracer->context = context;
+
 
 	spin_lock_irqsave(&ds_lock, irq);
 
 	error = -EPERM;
 	if (!check_tracer(task))
 		goto out_unlock;
-
 	get_tracer(task);
 
-	error = -EALREADY;
-	if (context->owner[qual] == current)
-		goto out_put_tracer;
 	error = -EPERM;
-	if (context->owner[qual] != NULL)
+	if (context->owner[qual])
 		goto out_put_tracer;
-	context->owner[qual] = current;
+	context->owner[qual] = tracer;
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
 
-	error = -ENOMEM;
-	if (!base) {
-		base = ds_allocate_buffer(size, &context->pages[qual]);
-		if (!base)
-			goto out_release;
-
-		context->buffer[qual]   = base;
-	}
-	error = 0;
+	ds_install_ds_config(context, qual, base, size, th);
 
-	context->callback[qual] = ovfl;
-
-	/* adjust the buffer address and size to meet alignment
-	 * constraints:
-	 * - buffer is double-word aligned
-	 * - size is multiple of record size
-	 *
-	 * We checked the size at the very beginning; we have enough
-	 * space to do the adjustment.
-	 */
-	buffer = (unsigned long)base;
-
-	adj = ALIGN(buffer, alignment) - buffer;
-	buffer += adj;
-	size   -= adj;
-
-	size /= ds_cfg.sizeof_rec[qual];
-	size *= ds_cfg.sizeof_rec[qual];
-
-	ds_set(context->ds, qual, ds_buffer_base, buffer);
-	ds_set(context->ds, qual, ds_index, buffer);
-	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
-
-	if (ovfl) {
-		/* todo: select a suitable interrupt threshold */
-	} else
-		ds_set(context->ds, qual,
-		       ds_interrupt_threshold, buffer + size + 1);
-
-	/* we keep the context until ds_release */
-	return error;
-
- out_release:
-	context->owner[qual] = NULL;
-	ds_put_context(context);
-	put_tracer(task);
-	return error;
+	return 0;
 
  out_put_tracer:
-	spin_unlock_irqrestore(&ds_lock, irq);
-	ds_put_context(context);
 	put_tracer(task);
-	return error;
-
  out_unlock:
 	spin_unlock_irqrestore(&ds_lock, irq);
 	ds_put_context(context);
+	tracer->context = NULL;
+ out:
 	return error;
 }
 
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
-		   ds_ovfl_callback_t ovfl)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+				  void *base, size_t size,
+				  bts_ovfl_callback_t ovfl, size_t th)
 {
-	return ds_request(task, base, size, ovfl, ds_bts);
-}
+	struct bts_tracer *tracer;
+	int error;
 
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-		    ds_ovfl_callback_t ovfl)
-{
-	return ds_request(task, base, size, ovfl, ds_pebs);
+	/* buffer overflow notification is not yet implemented */
+	error = -EOPNOTSUPP;
+	if (ovfl)
+		goto out;
+
+	error = -ENOMEM;
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		goto out;
+	tracer->ovfl = ovfl;
+
+	error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
+	if (error < 0)
+		goto out_tracer;
+
+	return tracer;
+
+ out_tracer:
+	(void)ds_release_bts(tracer);
+ out:
+	return ERR_PTR(error);
 }
 
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+				    void *base, size_t size,
+				    pebs_ovfl_callback_t ovfl, size_t th)
 {
-	struct ds_context *context;
+	struct pebs_tracer *tracer;
 	int error;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
+	/* buffer overflow notification is not yet implemented */
+	error = -EOPNOTSUPP;
+	if (ovfl)
 		goto out;
 
-	kfree(context->buffer[qual]);
-	context->buffer[qual] = NULL;
+	error = -ENOMEM;
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		goto out;
+	tracer->ovfl = ovfl;
 
-	current->mm->total_vm  -= context->pages[qual];
-	current->mm->locked_vm -= context->pages[qual];
-	context->pages[qual] = 0;
-	context->owner[qual] = NULL;
+	error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
+	if (error < 0)
+		goto out_tracer;
 
-	/*
-	 * we put the context twice:
-	 *   once for the ds_get_context
-	 *   once for the corresponding ds_request
-	 */
-	ds_put_context(context);
+	return tracer;
+
+ out_tracer:
+	(void)ds_release_pebs(tracer);
  out:
-	ds_put_context(context);
-	return error;
+	return ERR_PTR(error);
+}
+
+static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
+{
+	if (tracer->context) {
+		BUG_ON(tracer->context->owner[qual] != tracer);
+		tracer->context->owner[qual] = NULL;
+
+		put_tracer(tracer->context->task);
+		ds_put_context(tracer->context);
+	}
+
+	if (tracer->pages) {
+		kfree(tracer->buffer);
+
+		down_write(&current->mm->mmap_sem);
+
+		current->mm->total_vm  -= tracer->pages;
+		current->mm->locked_vm -= tracer->pages;
+
+		up_write(&current->mm->mmap_sem);
+	}
 }
 
-int ds_release_bts(struct task_struct *task)
+int ds_release_bts(struct bts_tracer *tracer)
 {
-	return ds_release(task, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_release(&tracer->ds, ds_bts);
+	kfree(tracer);
+
+	return 0;
 }
 
-int ds_release_pebs(struct task_struct *task)
+int ds_release_pebs(struct pebs_tracer *tracer)
 {
-	return ds_release(task, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_release(&tracer->ds, ds_pebs);
+	kfree(tracer);
+
+	return 0;
 }
 
-static int ds_get_index(struct task_struct *task, size_t *pos,
-			enum ds_qualifier qual)
+static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
 {
-	struct ds_context *context;
 	unsigned long base, index;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
 
 	base  = ds_get(context->ds, qual, ds_buffer_base);
 	index = ds_get(context->ds, qual, ds_index);
 
-	error = ((index - base) / ds_cfg.sizeof_rec[qual]);
-	if (pos)
-		*pos = error;
- out:
-	ds_put_context(context);
-	return error;
+	return (index - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
 {
-	return ds_get_index(task, pos, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_index(tracer->ds.context, ds_bts);
+
+	return 0;
 }
 
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
 {
-	return ds_get_index(task, pos, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_index(tracer->ds.context, ds_pebs);
+
+	return 0;
 }
 
-static int ds_get_end(struct task_struct *task, size_t *pos,
-		      enum ds_qualifier qual)
+static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
 {
-	struct ds_context *context;
-	unsigned long base, end;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
+	unsigned long base, max;
 
 	base = ds_get(context->ds, qual, ds_buffer_base);
-	end  = ds_get(context->ds, qual, ds_absolute_maximum);
+	max  = ds_get(context->ds, qual, ds_absolute_maximum);
 
-	error = ((end - base) / ds_cfg.sizeof_rec[qual]);
-	if (pos)
-		*pos = error;
- out:
-	ds_put_context(context);
-	return error;
+	return (max - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
+int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
 {
-	return ds_get_end(task, pos, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_end(tracer->ds.context, ds_bts);
+
+	return 0;
 }
 
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
 {
-	return ds_get_end(task, pos, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	if (!pos)
+		return -EINVAL;
+
+	*pos = ds_get_end(tracer->ds.context, ds_pebs);
+
+	return 0;
 }
 
-static int ds_access(struct task_struct *task, size_t index,
-		     const void **record, enum ds_qualifier qual)
+static int ds_access(struct ds_context *context, enum ds_qualifier qual,
+		     size_t index, const void **record)
 {
-	struct ds_context *context;
 	unsigned long base, idx;
-	int error;
 
 	if (!record)
 		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
-
 	base = ds_get(context->ds, qual, ds_buffer_base);
 	idx = base + (index * ds_cfg.sizeof_rec[qual]);
 
-	error = -EINVAL;
 	if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
-		goto out;
+		return -EINVAL;
 
 	*record = (const void *)idx;
-	error = ds_cfg.sizeof_rec[qual];
- out:
-	ds_put_context(context);
-	return error;
+
+	return ds_cfg.sizeof_rec[qual];
 }
 
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
+int ds_access_bts(struct bts_tracer *tracer, size_t index,
+		  const void **record)
 {
-	return ds_access(task, index, record, ds_bts);
+	if (!tracer)
+		return -EINVAL;
+
+	return ds_access(tracer->ds.context, ds_bts, index, record);
 }
 
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
+int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
+		   const void **record)
 {
-	return ds_access(task, index, record, ds_pebs);
+	if (!tracer)
+		return -EINVAL;
+
+	return ds_access(tracer->ds.context, ds_pebs, index, record);
 }
 
-static int ds_write(struct task_struct *task, const void *record, size_t size,
-		    enum ds_qualifier qual, int force)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+		    const void *record, size_t size)
 {
-	struct ds_context *context;
-	int error;
+	int bytes_written = 0;
 
 	if (!record)
 		return -EINVAL;
 
-	error = -EPERM;
-	context = ds_get_context(task);
-	if (!context)
-		goto out;
-
-	if (!force) {
-		error = ds_validate_access(context, qual);
-		if (error < 0)
-			goto out;
-	}
-
-	error = 0;
 	while (size) {
 		unsigned long base, index, end, write_end, int_th;
 		unsigned long write_size, adj_write_size;
@@ -678,14 +689,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
 			write_end = end;
 
 		if (write_end <= index)
-			goto out;
+			break;
 
 		write_size = min((unsigned long) size, write_end - index);
 		memcpy((void *)index, record, write_size);
 
 		record = (const char *)record + write_size;
-		size  -= write_size;
-		error += write_size;
+		size -= write_size;
+		bytes_written += write_size;
 
 		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 		adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -700,47 +711,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
 		ds_set(context->ds, qual, ds_index, index);
 
 		if (index >= int_th)
-			ds_overflow(task, context, qual);
+			ds_overflow(context, qual);
 	}
 
- out:
-	ds_put_context(context);
-	return error;
+	return bytes_written;
 }
 
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
 {
-	return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+	if (!tracer)
+		return -EINVAL;
 
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
-	return ds_write(task, record, size, ds_pebs, /* force = */ 0);
+	return ds_write(tracer->ds.context, ds_bts, record, size);
 }
 
-int ds_unchecked_write_bts(struct task_struct *task,
-			   const void *record, size_t size)
+int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
 {
-	return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+	if (!tracer)
+		return -EINVAL;
 
-int ds_unchecked_write_pebs(struct task_struct *task,
-			    const void *record, size_t size)
-{
-	return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+	return ds_write(tracer->ds.context, ds_pebs, record, size);
 }
 
-static int ds_reset_or_clear(struct task_struct *task,
-			     enum ds_qualifier qual, int clear)
+static void ds_reset_or_clear(struct ds_context *context,
+			      enum ds_qualifier qual, int clear)
 {
-	struct ds_context *context;
 	unsigned long base, end;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, qual);
-	if (error < 0)
-		goto out;
 
 	base = ds_get(context->ds, qual, ds_buffer_base);
 	end  = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -749,70 +745,69 @@ static int ds_reset_or_clear(struct task_struct *task,
 		memset((void *)base, 0, end - base);
 
 	ds_set(context->ds, qual, ds_index, base);
-
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
 }
 
-int ds_reset_bts(struct task_struct *task)
+int ds_reset_bts(struct bts_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
+
+	return 0;
 }
 
-int ds_reset_pebs(struct task_struct *task)
+int ds_reset_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
+
+	return 0;
 }
 
-int ds_clear_bts(struct task_struct *task)
+int ds_clear_bts(struct bts_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
+
+	return 0;
 }
 
-int ds_clear_pebs(struct task_struct *task)
+int ds_clear_pebs(struct pebs_tracer *tracer)
 {
-	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+	if (!tracer)
+		return -EINVAL;
+
+	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
+
+	return 0;
 }
 
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
 {
-	struct ds_context *context;
-	int error;
+	if (!tracer)
+		return -EINVAL;
 
 	if (!value)
 		return -EINVAL;
 
-	context = ds_get_context(task);
-	error = ds_validate_access(context, ds_pebs);
-	if (error < 0)
-		goto out;
-
-	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+	*value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
 
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	return 0;
 }
 
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 {
-	struct ds_context *context;
-	int error;
-
-	context = ds_get_context(task);
-	error = ds_validate_access(context, ds_pebs);
-	if (error < 0)
-		goto out;
+	if (!tracer)
+		return -EINVAL;
 
-	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
 
-	error = 0;
- out:
-	ds_put_context(context);
-	return error;
+	return 0;
 }
 
 static const struct ds_configuration ds_cfg_var = {
@@ -840,6 +835,10 @@ static inline void
 ds_configure(const struct ds_configuration *cfg)
 {
 	ds_cfg = *cfg;
+
+	printk(KERN_INFO "DS available\n");
+
+	BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -883,6 +882,8 @@ void ds_free(struct ds_context *context)
 	 * is dying. There should not be any user of that context left
 	 * to disturb us, anymore. */
 	unsigned long leftovers = context->count;
-	while (leftovers--)
+	while (leftovers--) {
+		put_tracer(context->task);
 		ds_put_context(context);
+	}
 }
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 06180dff5b2..76adf5b640f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 	size_t bts_index, bts_end;
 	int error;
 
-	error = ds_get_bts_end(child, &bts_end);
+	error = ds_get_bts_end(child->bts, &bts_end);
 	if (error < 0)
 		return error;
 
 	if (bts_end <= index)
 		return -EINVAL;
 
-	error = ds_get_bts_index(child, &bts_index);
+	error = ds_get_bts_index(child->bts, &bts_index);
 	if (error < 0)
 		return error;
 
@@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 	if (bts_end <= bts_index)
 		bts_index -= bts_end;
 
-	error = ds_access_bts(child, bts_index, &bts_record);
+	error = ds_access_bts(child->bts, bts_index, &bts_record);
 	if (error < 0)
 		return error;
 
@@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,
 	size_t end, i;
 	int error;
 
-	error = ds_get_bts_index(child, &end);
+	error = ds_get_bts_index(child->bts, &end);
 	if (error < 0)
 		return error;
 
 	if (size < (end * sizeof(struct bts_struct)))
 		return -EIO;
 
-	error = ds_access_bts(child, 0, (const void **)&raw);
+	error = ds_access_bts(child->bts, 0, (const void **)&raw);
 	if (error < 0)
 		return error;
 
@@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,
 			return -EFAULT;
 	}
 
-	error = ds_clear_bts(child);
+	error = ds_clear_bts(child->bts);
 	if (error < 0)
 		return error;
 
 	return end;
 }
 
-static void ptrace_bts_ovfl(struct task_struct *child)
-{
-	send_sig(child->thread.bts_ovfl_signal, child, 0);
-}
-
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
@@ -760,23 +755,29 @@ static int ptrace_bts_config(struct task_struct *child,
 		goto errout;
 
 	if (cfg.flags & PTRACE_BTS_O_ALLOC) {
-		ds_ovfl_callback_t ovfl = NULL;
+		bts_ovfl_callback_t ovfl = NULL;
 		unsigned int sig = 0;
 
-		/* we ignore the error in case we were not tracing child */
-		(void)ds_release_bts(child);
-
 		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 			if (!cfg.signal)
 				goto errout;
 
+			error = -EOPNOTSUPP;
+			goto errout;
+
 			sig  = cfg.signal;
-			ovfl = ptrace_bts_ovfl;
 		}
 
-		error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
-		if (error < 0)
+		if (child->bts)
+			(void)ds_release_bts(child->bts);
+
+		child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size,
+					    ovfl, /* th = */ (size_t)-1);
+		if (IS_ERR(child->bts)) {
+			error = PTR_ERR(child->bts);
+			child->bts = NULL;
 			goto errout;
+		}
 
 		child->thread.bts_ovfl_signal = sig;
 	}
@@ -823,15 +824,15 @@ static int ptrace_bts_status(struct task_struct *child,
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
-	error = ds_get_bts_end(child, &end);
+	error = ds_get_bts_end(child->bts, &end);
 	if (error < 0)
 		return error;
 
-	error = ds_access_bts(child, /* index = */ 0, &base);
+	error = ds_access_bts(child->bts, /* index = */ 0, &base);
 	if (error < 0)
 		return error;
 
-	error = ds_access_bts(child, /* index = */ end, &max);
+	error = ds_access_bts(child->bts, /* index = */ end, &max);
 	if (error < 0)
 		return error;
 
@@ -884,10 +885,7 @@ static int ptrace_bts_write_record(struct task_struct *child,
 		return -EINVAL;
 	}
 
-	/* The writing task will be the switched-to task on a context
-	 * switch. It needs to write into the switched-from task's BTS
-	 * buffer. */
-	return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+	return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
 }
 
 void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -972,13 +970,15 @@ void ptrace_disable(struct task_struct *child)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 #ifdef CONFIG_X86_PTRACE_BTS
-	(void)ds_release_bts(child);
+	if (child->bts) {
+		(void)ds_release_bts(child->bts);
 
-	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-	if (!child->thread.debugctlmsr)
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+		if (!child->thread.debugctlmsr)
+			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
 
-	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+	}
 #endif /* CONFIG_X86_PTRACE_BTS */
 }
 
@@ -1110,9 +1110,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 			(child, data, (struct ptrace_bts_config __user *)addr);
 		break;
 
-	case PTRACE_BTS_SIZE:
-		ret = ds_get_bts_index(child, /* pos = */ NULL);
+	case PTRACE_BTS_SIZE: {
+		size_t size;
+
+		ret = ds_get_bts_index(child->bts, &size);
+		if (ret == 0) {
+			BUG_ON(size != (int) size);
+			ret = (int) size;
+		}
 		break;
+	}
 
 	case PTRACE_BTS_GET:
 		ret = ptrace_bts_read_record
@@ -1120,7 +1127,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	case PTRACE_BTS_CLEAR:
-		ret = ds_clear_bts(child);
+		ret = ds_clear_bts(child->bts);
 		break;
 
 	case PTRACE_BTS_DRAIN:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bee1e93c95a..a9780eaa673 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -96,6 +96,7 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
+struct bts_tracer;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1161,6 +1162,14 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
+#ifdef CONFIG_X86_PTRACE_BTS
+	/*
+	 * This is the tracer handle for the ptrace BTS extension.
+	 * This field actually belongs to the ptracer task.
+	 */
+	struct bts_tracer *bts;
+#endif /* CONFIG_X86_PTRACE_BTS */
+
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
-- 
cgit v1.2.3-70-g09d2


From 6abb11aecd888d1da6276399380b7355f127c006 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 25 Nov 2008 09:05:27 +0100
Subject: x86, bts, ptrace: move BTS buffer allocation from ds.c into ptrace.c

Impact: restructure DS memory allocation to be done by the usage site of DS

Require pre-allocated buffers in ds.h.

Move the BTS buffer allocation for ptrace into ptrace.c.
The pointer to the allocated buffer is stored in the traced task's
task_struct together with the handle returned by ds_request_bts().

Removes memory accounting code.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h | 12 +++----
 arch/x86/kernel/ds.c      | 92 ++++++++---------------------------------------
 arch/x86/kernel/ptrace.c  | 22 ++++++++++--
 include/linux/sched.h     |  4 +++
 4 files changed, 42 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 0af997de5f0..99b6c39774a 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -54,8 +53,7 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
  * task: the task to request recording for;
  *       NULL for per-cpu recording on the current cpu
  * base: the base pointer for the (non-pageable) buffer;
- *       NULL if buffer allocation requested
- * size: the size of the requested or provided buffer in bytes
+ * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
  *       NULL if cyclic buffer requested
  * th: the interrupt threshold in records from the end of the buffer;
@@ -72,8 +70,6 @@ extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 /*
  * Release BTS or PEBS resources
  *
- * Frees buffers allocated on ds_request.
- *
  * Returns 0 on success; -Eerrno otherwise
  *
  * tracer: the tracer handle returned from ds_request_~()
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 96768e9cce9..19a8c2c0389 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -57,8 +56,6 @@ struct ds_tracer {
 	/* the buffer provided on ds_request() and its size in bytes */
 	void *buffer;
 	size_t size;
-	/* the number of allocated pages for on-request allocated buffers */
-	unsigned int pages;
 };
 
 struct bts_tracer {
@@ -141,8 +138,7 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
 
 
 /*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
+ * Locking is done only for allocating BTS or PEBS resources.
  */
 static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
@@ -292,50 +288,6 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
 }
 
 
-/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
- *
- * Returns the buffer, if successful;
- *         NULL, if out of memory or rlimit exceeded.
- *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
- */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
-{
-	unsigned long rlim, vm, pgsz;
-	void *buffer = NULL;
-
-	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&current->mm->mmap_sem);
-
-	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->total_vm  + pgsz;
-	if (rlim < vm)
-		goto out;
-
-	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->locked_vm  + pgsz;
-	if (rlim < vm)
-		goto out;
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto out;
-
-	current->mm->total_vm  += pgsz;
-	current->mm->locked_vm += pgsz;
-
-	if (pages)
-		*pages = pgsz;
-
- out:
-	up_write(&current->mm->mmap_sem);
-	return buffer;
-}
-
 static void ds_install_ds_config(struct ds_context *context,
 				 enum ds_qualifier qual,
 				 void *base, size_t size, size_t ith)
@@ -382,6 +334,10 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
 	if (!ds_cfg.sizeof_ds)
 		goto out;
 
+	error = -EINVAL;
+	if (!base)
+		goto out;
+
 	/* we require some space to do alignment adjustments below */
 	error = -EINVAL;
 	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
@@ -395,13 +351,6 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
 			goto out;
 	}
 
-	error = -ENOMEM;
-	if (!base) {
-		base = ds_allocate_buffer(size, &tracer->pages);
-		if (!base)
-			goto out;
-	}
-
 	tracer->buffer = base;
 	tracer->size = size;
 
@@ -466,7 +415,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	return tracer;
 
  out_tracer:
-	(void)ds_release_bts(tracer);
+	kfree(tracer);
  out:
 	return ERR_PTR(error);
 }
@@ -496,31 +445,18 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	return tracer;
 
  out_tracer:
-	(void)ds_release_pebs(tracer);
+	kfree(tracer);
  out:
 	return ERR_PTR(error);
 }
 
 static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
 {
-	if (tracer->context) {
-		BUG_ON(tracer->context->owner[qual] != tracer);
-		tracer->context->owner[qual] = NULL;
-
-		put_tracer(tracer->context->task);
-		ds_put_context(tracer->context);
-	}
+	BUG_ON(tracer->context->owner[qual] != tracer);
+	tracer->context->owner[qual] = NULL;
 
-	if (tracer->pages) {
-		kfree(tracer->buffer);
-
-		down_write(&current->mm->mmap_sem);
-
-		current->mm->total_vm  -= tracer->pages;
-		current->mm->locked_vm -= tracer->pages;
-
-		up_write(&current->mm->mmap_sem);
-	}
+	put_tracer(tracer->context->task);
+	ds_put_context(tracer->context);
 }
 
 int ds_release_bts(struct bts_tracer *tracer)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 76adf5b640f..2c8ec1ba75e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -758,6 +758,10 @@ static int ptrace_bts_config(struct task_struct *child,
 		bts_ovfl_callback_t ovfl = NULL;
 		unsigned int sig = 0;
 
+		error = -EINVAL;
+		if (cfg.size < (10 * bts_cfg.sizeof_bts))
+			goto errout;
+
 		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 			if (!cfg.signal)
 				goto errout;
@@ -768,14 +772,26 @@ static int ptrace_bts_config(struct task_struct *child,
 			sig  = cfg.signal;
 		}
 
-		if (child->bts)
+		if (child->bts) {
 			(void)ds_release_bts(child->bts);
+			kfree(child->bts_buffer);
+
+			child->bts = NULL;
+			child->bts_buffer = NULL;
+		}
+
+		error = -ENOMEM;
+		child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
+		if (!child->bts_buffer)
+			goto errout;
 
-		child->bts = ds_request_bts(child, /* base = */ NULL, cfg.size,
+		child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
 					    ovfl, /* th = */ (size_t)-1);
 		if (IS_ERR(child->bts)) {
 			error = PTR_ERR(child->bts);
+			kfree(child->bts_buffer);
 			child->bts = NULL;
+			child->bts_buffer = NULL;
 			goto errout;
 		}
 
@@ -972,6 +988,8 @@ void ptrace_disable(struct task_struct *child)
 #ifdef CONFIG_X86_PTRACE_BTS
 	if (child->bts) {
 		(void)ds_release_bts(child->bts);
+		kfree(child->bts_buffer);
+		child->bts_buffer = NULL;
 
 		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
 		if (!child->thread.debugctlmsr)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a9780eaa673..d02a0ca70ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1168,6 +1168,10 @@ struct task_struct {
 	 * This field actually belongs to the ptracer task.
 	 */
 	struct bts_tracer *bts;
+	/*
+	 * The buffer to hold the BTS data.
+	 */
+	void *bts_buffer;
 #endif /* CONFIG_X86_PTRACE_BTS */
 
 	/* PID/PID hash table linkage. */
-- 
cgit v1.2.3-70-g09d2


From 3f2355cb9111ac04e7ae06a4d7044da2ae813863 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <lrodriguez@atheros.com>
Date: Wed, 12 Nov 2008 14:22:02 -0800
Subject: cfg80211/mac80211: Add 802.11d support

This adds country IE parsing to mac80211 and enables its usage
within the new regulatory infrastructure in cfg80211. We parse
the country IEs only on management beacons for the BSSID you are
associated to and disregard the IEs when the country and environment
(indoor, outdoor, any) matches the already processed country IE.

To avoid following misinformed or outdated APs we build and use
a regulatory domain out of the intersection between what the AP
provides us on the country IE and what CRDA is aware is allowed
on the same country.

A secondary device is allowed to follow only the same country IE
as it make no sense for two devices on a system to be in two
different countries.

In the case the AP is using country IEs for an incorrect country
the user may help compliance further by setting the regulatory
domain before or after the IE is parsed and in that case another
intersection will be performed.

CONFIG_WIRELESS_OLD_REGULATORY is supported but requires CRDA
present.

Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h |  62 ++++++
 include/net/wireless.h    |  15 ++
 net/mac80211/mlme.c       |   7 +
 net/wireless/Kconfig      |  11 ++
 net/wireless/core.c       |   5 +-
 net/wireless/core.h       |  13 ++
 net/wireless/nl80211.c    |   2 +-
 net/wireless/reg.c        | 479 ++++++++++++++++++++++++++++++++++++++++++++--
 net/wireless/reg.h        |  21 +-
 9 files changed, 591 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 56b0eb25d92..a6ec928186a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1042,6 +1042,68 @@ enum ieee80211_spectrum_mgmt_actioncode {
 	WLAN_ACTION_SPCT_CHL_SWITCH = 4,
 };
 
+/*
+ * IEEE 802.11-2007 7.3.2.9 Country information element
+ *
+ * Minimum length is 8 octets, ie len must be evenly
+ * divisible by 2
+ */
+
+/* Although the spec says 8 I'm seeing 6 in practice */
+#define IEEE80211_COUNTRY_IE_MIN_LEN	6
+
+/*
+ * For regulatory extension stuff see IEEE 802.11-2007
+ * Annex I (page 1141) and Annex J (page 1147). Also
+ * review 7.3.2.9.
+ *
+ * When dot11RegulatoryClassesRequired is true and the
+ * first_channel/reg_extension_id is >= 201 then the IE
+ * compromises of the 'ext' struct represented below:
+ *
+ *  - Regulatory extension ID - when generating IE this just needs
+ *    to be monotonically increasing for each triplet passed in
+ *    the IE
+ *  - Regulatory class - index into set of rules
+ *  - Coverage class - index into air propagation time (Table 7-27),
+ *    in microseconds, you can compute the air propagation time from
+ *    the index by multiplying by 3, so index 10 yields a propagation
+ *    of 10 us. Valid values are 0-31, values 32-255 are not defined
+ *    yet. A value of 0 inicates air propagation of <= 1 us.
+ *
+ *  See also Table I.2 for Emission limit sets and table
+ *  I.3 for Behavior limit sets. Table J.1 indicates how to map
+ *  a reg_class to an emission limit set and behavior limit set.
+ */
+#define IEEE80211_COUNTRY_EXTENSION_ID 201
+
+/*
+ *  Channels numbers in the IE must be monotonically increasing
+ *  if dot11RegulatoryClassesRequired is not true.
+ *
+ *  If dot11RegulatoryClassesRequired is true consecutive
+ *  subband triplets following a regulatory triplet shall
+ *  have monotonically increasing first_channel number fields.
+ *
+ *  Channel numbers shall not overlap.
+ *
+ *  Note that max_power is signed.
+ */
+struct ieee80211_country_ie_triplet {
+	union {
+		struct {
+			u8 first_channel;
+			u8 num_channels;
+			s8 max_power;
+		} __attribute__ ((packed)) chans;
+		struct {
+			u8 reg_extension_id;
+			u8 reg_class;
+			u8 coverage_class;
+		} __attribute__ ((packed)) ext;
+	};
+} __attribute__ ((packed));
+
 /* BACK action code */
 enum ieee80211_back_actioncode {
 	WLAN_ACTION_ADDBA_REQ = 0,
diff --git a/include/net/wireless.h b/include/net/wireless.h
index 412351560b7..c85fa0ea5c5 100644
--- a/include/net/wireless.h
+++ b/include/net/wireless.h
@@ -373,4 +373,19 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
  * for a regulatory domain structure for the respective country.
  */
 extern void regulatory_hint(struct wiphy *wiphy, const char *alpha2);
+
+/**
+ * regulatory_hint_11d - hints a country IE as a regulatory domain
+ * @wiphy: the wireless device giving the hint (used only for reporting
+ *	conflicts)
+ * @country_ie: pointer to the country IE
+ * @country_ie_len: length of the country IE
+ *
+ * We will intersect the rd with the what CRDA tells us should apply
+ * for the alpha2 this country IE belongs to, this prevents APs from
+ * sending us incorrect or outdated information against a country.
+ */
+extern void regulatory_hint_11d(struct wiphy *wiphy,
+				u8 *country_ie,
+				u8 country_ie_len);
 #endif /* __NET_WIRELESS_H */
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d81a4d2cd3a..30adaddeed2 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1736,6 +1736,13 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 					       ap_ht_cap_flags);
 	}
 
+	if (elems.country_elem) {
+		/* Note we are only reviewing this on beacons
+		 * for the BSSID we are associated to */
+		regulatory_hint_11d(local->hw.wiphy,
+			elems.country_elem, elems.country_elem_len);
+	}
+
 	ieee80211_bss_info_change_notify(sdata, changed);
 }
 
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index f7c64dbe86c..e28e2b8fa43 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -1,6 +1,15 @@
 config CFG80211
         tristate "Improved wireless configuration API"
 
+config CFG80211_REG_DEBUG
+	bool "cfg80211 regulatory debugging"
+	depends on CFG80211
+	default n
+	---help---
+	  You can enable this if you want to debug regulatory changes.
+
+	  If unsure, say N.
+
 config NL80211
 	bool "nl80211 new netlink interface support"
 	depends on CFG80211
@@ -40,6 +49,8 @@ config WIRELESS_OLD_REGULATORY
 	  ieee80211_regdom module parameter. This is being phased out and you
 	  should stop using them ASAP.
 
+	  Note: You will need CRDA if you want 802.11d support
+
 	  Say Y unless you have installed a new userspace application.
 	  Also say Y if have one currently depending on the ieee80211_regdom
 	  module parameter and cannot port it to use the new userspace
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 39e3d10fccd..b96fc0c3f1c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -19,7 +19,6 @@
 #include "nl80211.h"
 #include "core.h"
 #include "sysfs.h"
-#include "reg.h"
 
 /* name for sysfs, %d is appended */
 #define PHY_NAME "phy"
@@ -348,6 +347,10 @@ void wiphy_unregister(struct wiphy *wiphy)
 	/* unlock again before freeing */
 	mutex_unlock(&drv->mtx);
 
+	/* If this device got a regulatory hint tell core its
+	 * free to listen now to a new shiny device regulatory hint */
+	reg_device_remove(wiphy);
+
 	list_del(&drv->list);
 	device_del(&drv->wiphy.dev);
 	debugfs_remove(drv->wiphy.debugfsdir);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 771cc5cc765..f7fb9f41302 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -11,6 +11,7 @@
 #include <net/genetlink.h>
 #include <net/wireless.h>
 #include <net/cfg80211.h>
+#include "reg.h"
 
 struct cfg80211_registered_device {
 	struct cfg80211_ops *ops;
@@ -21,6 +22,18 @@ struct cfg80211_registered_device {
 	 * any call is in progress */
 	struct mutex mtx;
 
+	/* ISO / IEC 3166 alpha2 for which this device is receiving
+	 * country IEs on, this can help disregard country IEs from APs
+	 * on the same alpha2 quickly. The alpha2 may differ from
+	 * cfg80211_regdomain's alpha2 when an intersection has occurred.
+	 * If the AP is reconfigured this can also be used to tell us if
+	 * the country on the country IE changed. */
+	char country_ie_alpha2[2];
+
+	/* If a Country IE has been received this tells us the environment
+	 * which its telling us its in. This defaults to ENVIRON_ANY */
+	enum environment_cap env;
+
 	/* wiphy index, internal only */
 	int idx;
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e3e1494e769..00121ceddb1 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1760,7 +1760,7 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 #endif
 	mutex_lock(&cfg80211_drv_mutex);
-	r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data);
+	r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, 0, ENVIRON_ANY);
 	mutex_unlock(&cfg80211_drv_mutex);
 	return r;
 }
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index f0ff3d1779d..4dab993ea48 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -60,12 +60,18 @@
  * @intersect: indicates whether the wireless core should intersect
  * 	the requested regulatory domain with the presently set regulatory
  * 	domain.
+ * @country_ie_checksum: checksum of the last processed and accepted
+ * 	country IE
+ * @country_ie_env: lets us know if the AP is telling us we are outdoor,
+ * 	indoor, or if it doesn't matter
  */
 struct regulatory_request {
 	struct wiphy *wiphy;
 	enum reg_set_by initiator;
 	char alpha2[2];
 	bool intersect;
+	u32 country_ie_checksum;
+	enum environment_cap country_ie_env;
 };
 
 /* Receipt of information from last regulatory request */
@@ -85,6 +91,11 @@ static u32 supported_bandwidths[] = {
  * information to give us an alpha2 */
 static const struct ieee80211_regdomain *cfg80211_regdomain;
 
+/* We use this as a place for the rd structure built from the
+ * last parsed country IE to rest until CRDA gets back to us with
+ * what it thinks should apply for the same country */
+static const struct ieee80211_regdomain *country_ie_regdomain;
+
 /* We keep a static world regulatory domain in case of the absence of CRDA */
 static const struct ieee80211_regdomain world_regdom = {
 	.n_reg_rules = 1,
@@ -264,6 +275,18 @@ static bool is_unknown_alpha2(const char *alpha2)
 	return false;
 }
 
+static bool is_intersected_alpha2(const char *alpha2)
+{
+	if (!alpha2)
+		return false;
+	/* Special case where regulatory domain is the
+	 * result of an intersection between two regulatory domain
+	 * structures */
+	if (alpha2[0] == '9' && alpha2[1] == '8')
+		return true;
+	return false;
+}
+
 static bool is_an_alpha2(const char *alpha2)
 {
 	if (!alpha2)
@@ -292,6 +315,25 @@ static bool regdom_changed(const char *alpha2)
 	return true;
 }
 
+/**
+ * country_ie_integrity_changes - tells us if the country IE has changed
+ * @checksum: checksum of country IE of fields we are interested in
+ *
+ * If the country IE has not changed you can ignore it safely. This is
+ * useful to determine if two devices are seeing two different country IEs
+ * even on the same alpha2. Note that this will return false if no IE has
+ * been set on the wireless core yet.
+ */
+static bool country_ie_integrity_changes(u32 checksum)
+{
+	/* If no IE has been set then the checksum doesn't change */
+	if (unlikely(!last_request->country_ie_checksum))
+		return false;
+	if (unlikely(last_request->country_ie_checksum != checksum))
+		return true;
+	return false;
+}
+
 /* This lets us keep regulatory code which is updated on a regulatory
  * basis in userspace. */
 static int call_crda(const char *alpha2)
@@ -379,6 +421,174 @@ static u32 freq_max_bandwidth(const struct ieee80211_freq_range *freq_range,
 	return 0;
 }
 
+/* Converts a country IE to a regulatory domain. A regulatory domain
+ * structure has a lot of information which the IE doesn't yet have,
+ * so for the other values we use upper max values as we will intersect
+ * with our userspace regulatory agent to get lower bounds. */
+static struct ieee80211_regdomain *country_ie_2_rd(
+				u8 *country_ie,
+				u8 country_ie_len,
+				u32 *checksum)
+{
+	struct ieee80211_regdomain *rd = NULL;
+	unsigned int i = 0;
+	char alpha2[2];
+	u32 flags = 0;
+	u32 num_rules = 0, size_of_regd = 0;
+	u8 *triplets_start = NULL;
+	u8 len_at_triplet = 0;
+	/* the last channel we have registered in a subband (triplet) */
+	int last_sub_max_channel = 0;
+
+	*checksum = 0xDEADBEEF;
+
+	/* Country IE requirements */
+	BUG_ON(country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN ||
+		country_ie_len & 0x01);
+
+	alpha2[0] = country_ie[0];
+	alpha2[1] = country_ie[1];
+
+	/*
+	 * Third octet can be:
+	 *    'I' - Indoor
+	 *    'O' - Outdoor
+	 *
+	 *  anything else we assume is no restrictions
+	 */
+	if (country_ie[2] == 'I')
+		flags = NL80211_RRF_NO_OUTDOOR;
+	else if (country_ie[2] == 'O')
+		flags = NL80211_RRF_NO_INDOOR;
+
+	country_ie += 3;
+	country_ie_len -= 3;
+
+	triplets_start = country_ie;
+	len_at_triplet = country_ie_len;
+
+	*checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8);
+
+	/* We need to build a reg rule for each triplet, but first we must
+	 * calculate the number of reg rules we will need. We will need one
+	 * for each channel subband */
+	while (country_ie_len >= 3) {
+		struct ieee80211_country_ie_triplet *triplet =
+			(struct ieee80211_country_ie_triplet *) country_ie;
+		int cur_sub_max_channel = 0, cur_channel = 0;
+
+		if (triplet->ext.reg_extension_id >=
+				IEEE80211_COUNTRY_EXTENSION_ID) {
+			country_ie += 3;
+			country_ie_len -= 3;
+			continue;
+		}
+
+		cur_channel = triplet->chans.first_channel;
+		cur_sub_max_channel = ieee80211_channel_to_frequency(
+			cur_channel + triplet->chans.num_channels);
+
+		/* Basic sanity check */
+		if (cur_sub_max_channel < cur_channel)
+			return NULL;
+
+		/* Do not allow overlapping channels. Also channels
+		 * passed in each subband must be monotonically
+		 * increasing */
+		if (last_sub_max_channel) {
+			if (cur_channel <= last_sub_max_channel)
+				return NULL;
+			if (cur_sub_max_channel <= last_sub_max_channel)
+				return NULL;
+		}
+
+		/* When dot11RegulatoryClassesRequired is supported
+		 * we can throw ext triplets as part of this soup,
+		 * for now we don't care when those change as we
+		 * don't support them */
+		*checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) |
+		  ((cur_sub_max_channel ^ cur_sub_max_channel) << 16) |
+		  ((triplet->chans.max_power ^ cur_sub_max_channel) << 24);
+
+		last_sub_max_channel = cur_sub_max_channel;
+
+		country_ie += 3;
+		country_ie_len -= 3;
+		num_rules++;
+
+		/* Note: this is not a IEEE requirement but
+		 * simply a memory requirement */
+		if (num_rules > NL80211_MAX_SUPP_REG_RULES)
+			return NULL;
+	}
+
+	country_ie = triplets_start;
+	country_ie_len = len_at_triplet;
+
+	size_of_regd = sizeof(struct ieee80211_regdomain) +
+		(num_rules * sizeof(struct ieee80211_reg_rule));
+
+	rd = kzalloc(size_of_regd, GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	rd->n_reg_rules = num_rules;
+	rd->alpha2[0] = alpha2[0];
+	rd->alpha2[1] = alpha2[1];
+
+	/* This time around we fill in the rd */
+	while (country_ie_len >= 3) {
+		struct ieee80211_country_ie_triplet *triplet =
+			(struct ieee80211_country_ie_triplet *) country_ie;
+		struct ieee80211_reg_rule *reg_rule = NULL;
+		struct ieee80211_freq_range *freq_range = NULL;
+		struct ieee80211_power_rule *power_rule = NULL;
+
+		/* Must parse if dot11RegulatoryClassesRequired is true,
+		 * we don't support this yet */
+		if (triplet->ext.reg_extension_id >=
+				IEEE80211_COUNTRY_EXTENSION_ID) {
+			country_ie += 3;
+			country_ie_len -= 3;
+			continue;
+		}
+
+		reg_rule = &rd->reg_rules[i];
+		freq_range = &reg_rule->freq_range;
+		power_rule = &reg_rule->power_rule;
+
+		reg_rule->flags = flags;
+
+		/* The +10 is since the regulatory domain expects
+		 * the actual band edge, not the center of freq for
+		 * its start and end freqs, assuming 20 MHz bandwidth on
+		 * the channels passed */
+		freq_range->start_freq_khz =
+			MHZ_TO_KHZ(ieee80211_channel_to_frequency(
+				triplet->chans.first_channel) - 10);
+		freq_range->end_freq_khz =
+			MHZ_TO_KHZ(ieee80211_channel_to_frequency(
+				triplet->chans.first_channel +
+					triplet->chans.num_channels) + 10);
+
+		/* Large arbitrary values, we intersect later */
+		/* Increment this if we ever support >= 40 MHz channels
+		 * in IEEE 802.11 */
+		freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40);
+		power_rule->max_antenna_gain = DBI_TO_MBI(100);
+		power_rule->max_eirp = DBM_TO_MBM(100);
+
+		country_ie += 3;
+		country_ie_len -= 3;
+		i++;
+
+		BUG_ON(i > NL80211_MAX_SUPP_REG_RULES);
+	}
+
+	return rd;
+}
+
+
 /* Helper for regdom_intersect(), this does the real
  * mathematical intersection fun */
 static int reg_rules_intersect(
@@ -663,16 +873,14 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
 					return -EOPNOTSUPP;
 				return -EALREADY;
 			}
-			/* Two consecutive Country IE hints on the same wiphy */
-			if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
+			/* Two consecutive Country IE hints on the same wiphy.
+			 * This should be picked up early by the driver/stack */
+			if (WARN_ON(!alpha2_equal(cfg80211_regdomain->alpha2,
+				  alpha2)))
 				return 0;
 			return -EALREADY;
 		}
-		/*
-		 * Ignore Country IE hints for now, need to think about
-		 * what we need to do to support multi-domain operation.
-		 */
-		return -EOPNOTSUPP;
+		return REG_INTERSECT;
 	case REGDOM_SET_BY_DRIVER:
 		if (last_request->initiator == REGDOM_SET_BY_DRIVER)
 			return -EALREADY;
@@ -680,6 +888,11 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
 	case REGDOM_SET_BY_USER:
 		if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE)
 			return REG_INTERSECT;
+		/* If the user knows better the user should set the regdom
+		 * to their country before the IE is picked up */
+		if (last_request->initiator == REGDOM_SET_BY_USER &&
+			  last_request->intersect)
+			return -EOPNOTSUPP;
 		return 0;
 	}
 
@@ -688,7 +901,9 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
 
 /* Caller must hold &cfg80211_drv_mutex */
 int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
-		      const char *alpha2)
+			const char *alpha2,
+			u32 country_ie_checksum,
+			enum environment_cap env)
 {
 	struct regulatory_request *request;
 	bool intersect = false;
@@ -711,9 +926,21 @@ int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
 	request->initiator = set_by;
 	request->wiphy = wiphy;
 	request->intersect = intersect;
+	request->country_ie_checksum = country_ie_checksum;
+	request->country_ie_env = env;
 
 	kfree(last_request);
 	last_request = request;
+	/*
+	 * Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled
+	 * AND if CRDA is NOT present nothing will happen, if someone
+	 * wants to bother with 11d with OLD_REG you can add a timer.
+	 * If after x amount of time nothing happens you can call:
+	 *
+	 * return set_regdom(country_ie_regdomain);
+	 *
+	 * to intersect with the static rd
+	 */
 	return call_crda(alpha2);
 }
 
@@ -722,11 +949,120 @@ void regulatory_hint(struct wiphy *wiphy, const char *alpha2)
 	BUG_ON(!alpha2);
 
 	mutex_lock(&cfg80211_drv_mutex);
-	__regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2);
+	__regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2, 0, ENVIRON_ANY);
 	mutex_unlock(&cfg80211_drv_mutex);
 }
 EXPORT_SYMBOL(regulatory_hint);
 
+static bool reg_same_country_ie_hint(struct wiphy *wiphy,
+			u32 country_ie_checksum)
+{
+	if (!last_request->wiphy)
+		return false;
+	if (likely(last_request->wiphy != wiphy))
+		return !country_ie_integrity_changes(country_ie_checksum);
+	/* We should not have let these through at this point, they
+	 * should have been picked up earlier by the first alpha2 check
+	 * on the device */
+	if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum)))
+		return true;
+	return false;
+}
+
+void regulatory_hint_11d(struct wiphy *wiphy,
+			u8 *country_ie,
+			u8 country_ie_len)
+{
+	struct ieee80211_regdomain *rd = NULL;
+	char alpha2[2];
+	u32 checksum = 0;
+	enum environment_cap env = ENVIRON_ANY;
+
+	mutex_lock(&cfg80211_drv_mutex);
+
+	/* IE len must be evenly divisible by 2 */
+	if (country_ie_len & 0x01)
+		goto out;
+
+	if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
+		goto out;
+
+	/* Pending country IE processing, this can happen after we
+	 * call CRDA and wait for a response if a beacon was received before
+	 * we were able to process the last regulatory_hint_11d() call */
+	if (country_ie_regdomain)
+		goto out;
+
+	alpha2[0] = country_ie[0];
+	alpha2[1] = country_ie[1];
+
+	if (country_ie[2] == 'I')
+		env = ENVIRON_INDOOR;
+	else if (country_ie[2] == 'O')
+		env = ENVIRON_OUTDOOR;
+
+	/* We will run this for *every* beacon processed for the BSSID, so
+	 * we optimize an early check to exit out early if we don't have to
+	 * do anything */
+	if (likely(last_request->wiphy)) {
+		struct cfg80211_registered_device *drv_last_ie;
+
+		drv_last_ie = wiphy_to_dev(last_request->wiphy);
+
+		/* Lets keep this simple -- we trust the first AP
+		 * after we intersect with CRDA */
+		if (likely(last_request->wiphy == wiphy)) {
+			/* Ignore IEs coming in on this wiphy with
+			 * the same alpha2 and environment cap */
+			if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
+				  alpha2) &&
+				  env == drv_last_ie->env)) {
+				goto out;
+			}
+			/* the wiphy moved on to another BSSID or the AP
+			 * was reconfigured. XXX: We need to deal with the
+			 * case where the user suspends and goes to goes
+			 * to another country, and then gets IEs from an
+			 * AP with different settings */
+			goto out;
+		} else {
+			/* Ignore IEs coming in on two separate wiphys with
+			 * the same alpha2 and environment cap */
+			if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
+				  alpha2) &&
+				  env == drv_last_ie->env)) {
+				goto out;
+			}
+			/* We could potentially intersect though */
+			goto out;
+		}
+	}
+
+	rd = country_ie_2_rd(country_ie, country_ie_len, &checksum);
+	if (!rd)
+		goto out;
+
+	/* This will not happen right now but we leave it here for the
+	 * the future when we want to add suspend/resume support and having
+	 * the user move to another country after doing so, or having the user
+	 * move to another AP. Right now we just trust the first AP. This is why
+	 * this is marked as likley(). If we hit this before we add this support
+	 * we want to be informed of it as it would indicate a mistake in the
+	 * current design  */
+	if (likely(WARN_ON(reg_same_country_ie_hint(wiphy, checksum))))
+		goto out;
+
+	/* We keep this around for when CRDA comes back with a response so
+	 * we can intersect with that */
+	country_ie_regdomain = rd;
+
+	__regulatory_hint(wiphy, REGDOM_SET_BY_COUNTRY_IE,
+		country_ie_regdomain->alpha2, checksum, env);
+
+out:
+	mutex_unlock(&cfg80211_drv_mutex);
+}
+EXPORT_SYMBOL(regulatory_hint_11d);
 
 static void print_rd_rules(const struct ieee80211_regdomain *rd)
 {
@@ -766,7 +1102,25 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
 static void print_regdomain(const struct ieee80211_regdomain *rd)
 {
 
-	if (is_world_regdom(rd->alpha2))
+	if (is_intersected_alpha2(rd->alpha2)) {
+		struct wiphy *wiphy = NULL;
+		struct cfg80211_registered_device *drv;
+
+		if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) {
+			if (last_request->wiphy) {
+				wiphy = last_request->wiphy;
+				drv = wiphy_to_dev(wiphy);
+				printk(KERN_INFO "cfg80211: Current regulatory "
+					"domain updated by AP to: %c%c\n",
+					drv->country_ie_alpha2[0],
+					drv->country_ie_alpha2[1]);
+			} else
+				printk(KERN_INFO "cfg80211: Current regulatory "
+					"domain intersected: \n");
+		} else
+				printk(KERN_INFO "cfg80211: Current regulatory "
+					"intersected: \n");
+	} else if (is_world_regdom(rd->alpha2))
 		printk(KERN_INFO "cfg80211: World regulatory "
 			"domain updated:\n");
 	else {
@@ -789,10 +1143,39 @@ static void print_regdomain_info(const struct ieee80211_regdomain *rd)
 	print_rd_rules(rd);
 }
 
+#ifdef CONFIG_CFG80211_REG_DEBUG
+static void reg_country_ie_process_debug(
+	const struct ieee80211_regdomain *rd,
+	const struct ieee80211_regdomain *country_ie_regdomain,
+	const struct ieee80211_regdomain *intersected_rd)
+{
+	printk(KERN_DEBUG "cfg80211: Received country IE:\n");
+	print_regdomain_info(country_ie_regdomain);
+	printk(KERN_DEBUG "cfg80211: CRDA thinks this should applied:\n");
+	print_regdomain_info(rd);
+	if (intersected_rd) {
+		printk(KERN_DEBUG "cfg80211: We intersect both of these "
+			"and get:\n");
+		print_regdomain_info(rd);
+		return;
+	}
+	printk(KERN_DEBUG "cfg80211: Intersection between both failed\n");
+}
+#else
+static inline void reg_country_ie_process_debug(
+	const struct ieee80211_regdomain *rd,
+	const struct ieee80211_regdomain *country_ie_regdomain,
+	const struct ieee80211_regdomain *intersected_rd)
+{
+}
+#endif
+
 /* Takes ownership of rd only if it doesn't fail */
 static int __set_regdom(const struct ieee80211_regdomain *rd)
 {
 	const struct ieee80211_regdomain *intersected_rd = NULL;
+	struct cfg80211_registered_device *drv = NULL;
+	struct wiphy *wiphy = NULL;
 	/* Some basic sanity checks first */
 
 	if (is_world_regdom(rd->alpha2)) {
@@ -809,10 +1192,18 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 	if (!last_request)
 		return -EINVAL;
 
-	/* allow overriding the static definitions if CRDA is present */
-	if (!is_old_static_regdom(cfg80211_regdomain) &&
-	    !regdom_changed(rd->alpha2))
-		return -EINVAL;
+	/* Lets only bother proceeding on the same alpha2 if the current
+	 * rd is non static (it means CRDA was present and was used last)
+	 * and the pending request came in from a country IE */
+	if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) {
+		/* If someone else asked us to change the rd lets only bother
+		 * checking if the alpha2 changes if CRDA was already called */
+		if (!is_old_static_regdom(cfg80211_regdomain) &&
+		    !regdom_changed(rd->alpha2))
+			return -EINVAL;
+	}
+
+	wiphy = last_request->wiphy;
 
 	/* Now lets set the regulatory domain, update all driver channels
 	 * and finally inform them of what we have done, in case they want
@@ -853,9 +1244,47 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 		return 0;
 	}
 
-	/* Country IE parsing coming soon */
+	/*
+	 * Country IE requests are handled a bit differently, we intersect
+	 * the country IE rd with what CRDA believes that country should have
+	 */
+
+	BUG_ON(!country_ie_regdomain);
+
+	if (rd != country_ie_regdomain) {
+		/* Intersect what CRDA returned and our what we
+		 * had built from the Country IE received */
+
+		intersected_rd = regdom_intersect(rd, country_ie_regdomain);
+
+		reg_country_ie_process_debug(rd, country_ie_regdomain,
+			intersected_rd);
+
+		kfree(country_ie_regdomain);
+		country_ie_regdomain = NULL;
+	} else {
+		/* This would happen when CRDA was not present and
+		 * OLD_REGULATORY was enabled. We intersect our Country
+		 * IE rd and what was set on cfg80211 originally */
+		intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
+	}
+
+	if (!intersected_rd)
+		return -EINVAL;
+
+	drv = wiphy_to_dev(wiphy);
+
+	drv->country_ie_alpha2[0] = rd->alpha2[0];
+	drv->country_ie_alpha2[1] = rd->alpha2[1];
+	drv->env = last_request->country_ie_env;
+
+	BUG_ON(intersected_rd == rd);
+
+	kfree(rd);
+	rd = NULL;
+
 	reset_regdomains();
-	WARN_ON(1);
+	cfg80211_regdomain = intersected_rd;
 
 	return 0;
 }
@@ -887,6 +1316,17 @@ int set_regdom(const struct ieee80211_regdomain *rd)
 	return r;
 }
 
+/* Caller must hold cfg80211_drv_mutex */
+void reg_device_remove(struct wiphy *wiphy)
+{
+	if (!last_request->wiphy)
+		return;
+	if (last_request->wiphy != wiphy)
+		return;
+	last_request->wiphy = NULL;
+	last_request->country_ie_env = ENVIRON_ANY;
+}
+
 int regulatory_init(void)
 {
 	int err;
@@ -906,11 +1346,11 @@ int regulatory_init(void)
 	 * that is not a valid ISO / IEC 3166 alpha2 */
 	if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U')
 		err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE,
-					ieee80211_regdom);
+					ieee80211_regdom, 0, ENVIRON_ANY);
 #else
 	cfg80211_regdomain = cfg80211_world_regdom;
 
-	err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00");
+	err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", 0, ENVIRON_ANY);
 	if (err)
 		printk(KERN_ERR "cfg80211: calling CRDA failed - "
 		       "unable to update world regulatory domain, "
@@ -926,6 +1366,9 @@ void regulatory_exit(void)
 
 	reset_regdomains();
 
+	kfree(country_ie_regdomain);
+	country_ie_regdomain = NULL;
+
 	kfree(last_request);
 
 	platform_device_unregister(reg_pdev);
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index c9b6b6358bb..a76ea3ff7cd 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -4,28 +4,41 @@
 bool is_world_regdom(const char *alpha2);
 bool reg_is_valid_request(const char *alpha2);
 
+void reg_device_remove(struct wiphy *wiphy);
+
 int regulatory_init(void);
 void regulatory_exit(void);
 
 int set_regdom(const struct ieee80211_regdomain *rd);
 
+enum environment_cap {
+	ENVIRON_ANY,
+	ENVIRON_INDOOR,
+	ENVIRON_OUTDOOR,
+};
+
+
 /**
  * __regulatory_hint - hint to the wireless core a regulatory domain
  * @wiphy: if the hint comes from country information from an AP, this
  *	is required to be set to the wiphy that received the information
  * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain
  *	should be in.
+ * @country_ie_checksum: checksum of processed country IE, set this to 0
+ * 	if the hint did not come from a country IE
+ * @country_ie_env: the environment the IE told us we are in, %ENVIRON_*
  *
  * The Wireless subsystem can use this function to hint to the wireless core
- * what it believes should be the current regulatory domain by
- * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory
- * domain should be in.
+ * what it believes should be the current regulatory domain by giving it an
+ * ISO/IEC 3166 alpha2 country code it knows its regulatory domain should be
+ * in.
  *
  * Returns zero if all went fine, %-EALREADY if a regulatory domain had
  * already been set or other standard error codes.
  *
  */
 extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by,
-			     const char *alpha2);
+			     const char *alpha2, u32 country_ie_checksum,
+			     enum environment_cap country_ie_env);
 
 #endif  /* __NET_WIRELESS_REG_H */
-- 
cgit v1.2.3-70-g09d2


From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 25 Nov 2008 21:07:04 +0100
Subject: tracing/function-return-tracer: change the name into
 function-graph-tracer

Impact: cleanup

This patch changes the name of the "return function tracer" into
function-graph-tracer which is a more suitable name for a tracing
which makes one able to retrieve the ordered call stack during
the code flow.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                     |  2 +-
 arch/x86/include/asm/ftrace.h        |  4 +-
 arch/x86/kernel/Makefile             |  4 +-
 arch/x86/kernel/entry_32.S           | 12 ++---
 arch/x86/kernel/ftrace.c             | 12 ++---
 include/linux/ftrace.h               | 24 ++++-----
 include/linux/ftrace_irq.h           |  2 +-
 include/linux/sched.h                |  2 +-
 kernel/Makefile                      |  2 +-
 kernel/fork.c                        |  4 +-
 kernel/sched.c                       |  2 +-
 kernel/trace/Kconfig                 | 19 ++++---
 kernel/trace/Makefile                |  2 +-
 kernel/trace/ftrace.c                | 26 +++++-----
 kernel/trace/trace.c                 | 18 +++----
 kernel/trace/trace.h                 | 12 ++---
 kernel/trace/trace_functions_graph.c | 98 ++++++++++++++++++++++++++++++++++++
 17 files changed, 173 insertions(+), 72 deletions(-)
 create mode 100644 kernel/trace/trace_functions_graph.c

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e49a4fd718f..0842b112768 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,7 +29,7 @@ config X86
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
-	select HAVE_FUNCTION_RET_TRACER if X86_32
+	select HAVE_FUNCTION_GRAPH_TRACER if X86_32
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 754a3e082f9..7e61b4ceb9a 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -28,7 +28,7 @@ struct dyn_arch_ftrace {
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 #ifndef __ASSEMBLY__
 
@@ -51,6 +51,6 @@ struct ftrace_ret_stack {
 extern void return_to_handler(void);
 
 #endif /* __ASSEMBLY__ */
-#endif /* CONFIG_FUNCTION_RET_TRACER */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 #endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index af2bc36ca1c..64939a0c398 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -14,7 +14,7 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 endif
 
-ifdef CONFIG_FUNCTION_RET_TRACER
+ifdef CONFIG_FUNCTION_GRAPH_TRACER
 # Don't trace __switch_to() but let it for function tracer
 CFLAGS_REMOVE_process_32.o = -pg
 endif
@@ -70,7 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_FUNCTION_RET_TRACER)	+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 74defe21ba4..2b1f0f081a6 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1188,9 +1188,9 @@ ENTRY(mcount)
 
 	cmpl $ftrace_stub, ftrace_trace_function
 	jnz trace
-#ifdef CONFIG_FUNCTION_RET_TRACER
-	cmpl $ftrace_stub, ftrace_function_return
-	jnz ftrace_return_caller
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	cmpl $ftrace_stub, ftrace_graph_function
+	jnz ftrace_graph_caller
 #endif
 .globl ftrace_stub
 ftrace_stub:
@@ -1215,8 +1215,8 @@ END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
-ENTRY(ftrace_return_caller)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
 	cmpl $0, function_trace_stop
 	jne ftrace_stub
 
@@ -1230,7 +1230,7 @@ ENTRY(ftrace_return_caller)
 	popl %ecx
 	popl %eax
 	ret
-END(ftrace_return_caller)
+END(ftrace_graph_caller)
 
 .globl return_to_handler
 return_to_handler:
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index bb137f7297e..3595a4c14ab 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -323,7 +323,7 @@ int __init ftrace_dyn_arch_init(void *data)
 }
 #endif
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 #ifndef CONFIG_DYNAMIC_FTRACE
 
@@ -389,11 +389,11 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time,
  */
 unsigned long ftrace_return_to_handler(void)
 {
-	struct ftrace_retfunc trace;
+	struct ftrace_graph_ret trace;
 	pop_return_trace(&trace.ret, &trace.calltime, &trace.func,
 			&trace.overrun);
 	trace.rettime = cpu_clock(raw_smp_processor_id());
-	ftrace_function_return(&trace);
+	ftrace_graph_function(&trace);
 
 	return trace.ret;
 }
@@ -440,12 +440,12 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 	);
 
 	if (WARN_ON(faulted)) {
-		unregister_ftrace_return();
+		unregister_ftrace_graph();
 		return;
 	}
 
 	if (WARN_ON(!__kernel_text_address(old))) {
-		unregister_ftrace_return();
+		unregister_ftrace_graph();
 		*parent = old;
 		return;
 	}
@@ -456,4 +456,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		*parent = old;
 }
 
-#endif /* CONFIG_FUNCTION_RET_TRACER */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7854d87b97b..b4ac734ad8d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -115,8 +115,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
-#ifdef CONFIG_FUNCTION_RET_TRACER
-extern void ftrace_return_caller(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern void ftrace_graph_caller(void);
 #endif
 
 /**
@@ -315,7 +315,7 @@ ftrace_init_module(struct module *mod,
 /*
  * Structure that defines a return function trace.
  */
-struct ftrace_retfunc {
+struct ftrace_graph_ret {
 	unsigned long ret; /* Return address */
 	unsigned long func; /* Current function */
 	unsigned long long calltime;
@@ -324,22 +324,22 @@ struct ftrace_retfunc {
 	unsigned long overrun;
 };
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
 /* Type of a callback handler of tracing return function */
-typedef void (*trace_function_return_t)(struct ftrace_retfunc *);
+typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *);
 
-extern int register_ftrace_return(trace_function_return_t func);
+extern int register_ftrace_graph(trace_function_graph_t func);
 /* The current handler in use */
-extern trace_function_return_t ftrace_function_return;
-extern void unregister_ftrace_return(void);
+extern trace_function_graph_t ftrace_graph_function;
+extern void unregister_ftrace_graph(void);
 
-extern void ftrace_retfunc_init_task(struct task_struct *t);
-extern void ftrace_retfunc_exit_task(struct task_struct *t);
+extern void ftrace_graph_init_task(struct task_struct *t);
+extern void ftrace_graph_exit_task(struct task_struct *t);
 #else
-static inline void ftrace_retfunc_init_task(struct task_struct *t) { }
-static inline void ftrace_retfunc_exit_task(struct task_struct *t) { }
+static inline void ftrace_graph_init_task(struct task_struct *t) { }
+static inline void ftrace_graph_exit_task(struct task_struct *t) { }
 #endif
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 0b4df55d7a7..366a054d0b0 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -2,7 +2,7 @@
 #define _LINUX_FTRACE_IRQ_H
 
 
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER)
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d02a0ca70ee..7ad48f2a275 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1365,7 +1365,7 @@ struct task_struct {
 	unsigned long default_timer_slack_ns;
 
 	struct list_head	*scm_work_list;
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored adress in ret_stack */
 	int curr_ret_stack;
 	/* Stack of return addresses for return function tracing */
diff --git a/kernel/Makefile b/kernel/Makefile
index 03a45e7e87b..703cf3b7389 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -pg
 endif
-ifdef CONFIG_FUNCTION_RET_TRACER
+ifdef CONFIG_FUNCTION_GRAPH_TRACER
 CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
 CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
 endif
diff --git a/kernel/fork.c b/kernel/fork.c
index d6e1a3205f6..5f82a999c03 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -140,7 +140,7 @@ void free_task(struct task_struct *tsk)
 	prop_local_destroy_single(&tsk->dirties);
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
-	ftrace_retfunc_exit_task(tsk);
+	ftrace_graph_exit_task(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-	ftrace_retfunc_init_task(p);
+	ftrace_graph_init_task(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	return p;
diff --git a/kernel/sched.c b/kernel/sched.c
index 388d9db044a..52490bf6b88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5901,7 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-	ftrace_retfunc_init_task(idle);
+	ftrace_graph_init_task(idle);
 }
 
 /*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 620feadff67..eb9b901e077 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,7 +12,7 @@ config NOP_TRACER
 config HAVE_FUNCTION_TRACER
 	bool
 
-config HAVE_FUNCTION_RET_TRACER
+config HAVE_FUNCTION_GRAPH_TRACER
 	bool
 
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
@@ -63,15 +63,18 @@ config FUNCTION_TRACER
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 
-config FUNCTION_RET_TRACER
-	bool "Kernel Function return Tracer"
-	depends on HAVE_FUNCTION_RET_TRACER
+config FUNCTION_GRAPH_TRACER
+	bool "Kernel Function Graph Tracer"
+	depends on HAVE_FUNCTION_GRAPH_TRACER
 	depends on FUNCTION_TRACER
 	help
-	  Enable the kernel to trace a function at its return.
-	  It's first purpose is to trace the duration of functions.
-	  This is done by setting the current return address on the thread
-	  info structure of the current task.
+	  Enable the kernel to trace a function at both its return
+	  and its entry.
+	  It's first purpose is to trace the duration of functions and
+	  draw a call graph for each thread with some informations like
+	  the return value.
+	  This is done by setting the current return address on the current
+	  task structure into a stack of calls.
 
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cef4bcb4e82..08c5fe6ddc0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
-obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_BTS_TRACER) += trace_bts.o
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 53042f118f2..9e19976af72 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -395,11 +395,11 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	unsigned long ip, fl;
 	unsigned long ftrace_addr;
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
 		ftrace_addr = (unsigned long)ftrace_caller;
 	else
-		ftrace_addr = (unsigned long)ftrace_return_caller;
+		ftrace_addr = (unsigned long)ftrace_graph_caller;
 #else
 	ftrace_addr = (unsigned long)ftrace_caller;
 #endif
@@ -1496,13 +1496,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 static atomic_t ftrace_retfunc_active;
 
 /* The callback that hooks the return of a function */
-trace_function_return_t ftrace_function_return =
-			(trace_function_return_t)ftrace_stub;
+trace_function_graph_t ftrace_graph_function =
+			(trace_function_graph_t)ftrace_stub;
 
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
@@ -1549,7 +1549,7 @@ free:
 }
 
 /* Allocate a return stack for each task */
-static int start_return_tracing(void)
+static int start_graph_tracing(void)
 {
 	struct ftrace_ret_stack **ret_stack_list;
 	int ret;
@@ -1569,7 +1569,7 @@ static int start_return_tracing(void)
 	return ret;
 }
 
-int register_ftrace_return(trace_function_return_t func)
+int register_ftrace_graph(trace_function_graph_t func)
 {
 	int ret = 0;
 
@@ -1584,13 +1584,13 @@ int register_ftrace_return(trace_function_return_t func)
 		goto out;
 	}
 	atomic_inc(&ftrace_retfunc_active);
-	ret = start_return_tracing();
+	ret = start_graph_tracing();
 	if (ret) {
 		atomic_dec(&ftrace_retfunc_active);
 		goto out;
 	}
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
-	ftrace_function_return = func;
+	ftrace_graph_function = func;
 	ftrace_startup();
 
 out:
@@ -1598,12 +1598,12 @@ out:
 	return ret;
 }
 
-void unregister_ftrace_return(void)
+void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_sysctl_lock);
 
 	atomic_dec(&ftrace_retfunc_active);
-	ftrace_function_return = (trace_function_return_t)ftrace_stub;
+	ftrace_graph_function = (trace_function_graph_t)ftrace_stub;
 	ftrace_shutdown();
 	/* Restore normal tracing type */
 	ftrace_tracing_type = FTRACE_TYPE_ENTER;
@@ -1612,7 +1612,7 @@ void unregister_ftrace_return(void)
 }
 
 /* Allocate a return stack for newly created task */
-void ftrace_retfunc_init_task(struct task_struct *t)
+void ftrace_graph_init_task(struct task_struct *t)
 {
 	if (atomic_read(&ftrace_retfunc_active)) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
@@ -1626,7 +1626,7 @@ void ftrace_retfunc_init_task(struct task_struct *t)
 		t->ret_stack = NULL;
 }
 
-void ftrace_retfunc_exit_task(struct task_struct *t)
+void ftrace_graph_exit_task(struct task_struct *t)
 {
 	struct ftrace_ret_stack	*ret_stack = t->ret_stack;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8df8fdd69c9..f21ab2c68fd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -878,15 +878,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 }
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
-static void __trace_function_return(struct trace_array *tr,
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void __trace_function_graph(struct trace_array *tr,
 				struct trace_array_cpu *data,
-				struct ftrace_retfunc *trace,
+				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
 {
 	struct ring_buffer_event *event;
-	struct ftrace_ret_entry *entry;
+	struct ftrace_graph_entry *entry;
 	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
@@ -1177,8 +1177,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_FUNCTION_RET_TRACER
-void trace_function_return(struct ftrace_retfunc *trace)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+void trace_function_graph(struct ftrace_graph_ret *trace)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1193,12 +1193,12 @@ void trace_function_return(struct ftrace_retfunc *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_function_return(tr, data, trace, flags, pc);
+		__trace_function_graph(tr, data, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
 	raw_local_irq_restore(flags);
 }
-#endif /* CONFIG_FUNCTION_RET_TRACER */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 static struct ftrace_ops trace_ops __read_mostly =
 {
@@ -2001,7 +2001,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 		break;
 	}
 	case TRACE_FN_RET: {
-		return print_return_function(iter);
+		return print_graph_function(iter);
 		break;
 	}
 	case TRACE_BRANCH: {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3abd645e8af..72b5ef86876 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -57,7 +57,7 @@ struct ftrace_entry {
 };
 
 /* Function return entry */
-struct ftrace_ret_entry {
+struct ftrace_graph_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
 	unsigned long		parent_ip;
@@ -264,7 +264,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
-		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
+		IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\
 		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
 		__ftrace_bad_type();					\
 	} while (0)
@@ -398,7 +398,7 @@ void trace_function(struct trace_array *tr,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
 void
-trace_function_return(struct ftrace_retfunc *trace);
+trace_function_graph(struct ftrace_graph_ret *trace);
 
 void trace_bts(struct trace_array *tr,
 	       unsigned long from,
@@ -489,11 +489,11 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
 extern unsigned long trace_flags;
 
 /* Standard output formatting function used for function return traces */
-#ifdef CONFIG_FUNCTION_RET_TRACER
-extern enum print_line_t print_return_function(struct trace_iterator *iter);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern enum print_line_t print_graph_function(struct trace_iterator *iter);
 #else
 static inline enum print_line_t
-print_return_function(struct trace_iterator *iter)
+print_graph_function(struct trace_iterator *iter)
 {
 	return TRACE_TYPE_UNHANDLED;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 00000000000..f5bad4624d2
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,98 @@
+/*
+ *
+ * Function graph tracer.
+ * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+
+#define TRACE_GRAPH_PRINT_OVERRUN	0x1
+static struct tracer_opt trace_opts[] = {
+	/* Display overruns or not */
+	{ TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val = 0, /* Don't display overruns by default */
+	.opts = trace_opts
+};
+
+
+static int graph_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	return register_ftrace_graph(&trace_function_graph);
+}
+
+static void graph_trace_reset(struct trace_array *tr)
+{
+		unregister_ftrace_graph();
+}
+
+
+enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct ftrace_graph_entry *field;
+	int ret;
+
+	if (entry->type == TRACE_FN_RET) {
+		trace_assign_type(field, entry);
+		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = seq_print_ip_sym(s, field->ip,
+					trace_flags & TRACE_ITER_SYM_MASK);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		ret = trace_seq_printf(s, " (%llu ns)",
+					field->rettime - field->calltime);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+			ret = trace_seq_printf(s, " (Overruns: %lu)",
+						field->overrun);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer graph_trace __read_mostly = {
+	.name	     = "function-graph",
+	.init	     = graph_trace_init,
+	.reset	     = graph_trace_reset,
+	.print_line = print_graph_function,
+	.flags		= &tracer_flags,
+};
+
+static __init int init_graph_trace(void)
+{
+	return register_tracer(&graph_trace);
+}
+
+device_initcall(init_graph_trace);
-- 
cgit v1.2.3-70-g09d2


From 287b6e68ca7209caec40b2f44f837c580a413bae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 26 Nov 2008 00:57:25 +0100
Subject: tracing/function-return-tracer: set a more human readable output

Impact: feature

This patch sets a C-like output for the function graph tracing.
For this aim, we now call two handler for each function: one on the entry
and one other on return. This way we can draw a well-ordered call stack.

The pid of the previous trace is loosely stored to be compared against
the one of the current trace to see if there were a context switch.

Without this little feature, the call tree would seem broken at
some locations.
We could use the sched_tracer to capture these sched_events but this
way of processing is much more simpler.

2 spaces have been chosen for indentation to fit the screen while deep
calls. The time of execution in nanosecs is printed just after closed
braces, it seems more easy this way to find the corresponding function.
If the time was printed as a first column, it would be not so easy to
find the corresponding function if it is called on a deep depth.

I plan to output the return value but on 32 bits CPU, the return value
can be 32 or 64, and its difficult to guess on which case we are.
I don't know what would be the better solution on X86-32: only print
eax (low-part) or even edx (high-part).

Actually it's thee same problem when a function return a 8 bits value, the
high part of eax could contain junk values...

Here is an example of trace:

sys_read() {
  fget_light() {
  } 526
  vfs_read() {
    rw_verify_area() {
      security_file_permission() {
        cap_file_permission() {
        } 519
      } 1564
    } 2640
    do_sync_read() {
      pipe_read() {
        __might_sleep() {
        } 511
        pipe_wait() {
          prepare_to_wait() {
          } 760
          deactivate_task() {
            dequeue_task() {
              dequeue_task_fair() {
                dequeue_entity() {
                  update_curr() {
                    update_min_vruntime() {
                    } 504
                  } 1587
                  clear_buddies() {
                  } 512
                  add_cfs_task_weight() {
                  } 519
                  update_min_vruntime() {
                  } 511
                } 5602
                dequeue_entity() {
                  update_curr() {
                    update_min_vruntime() {
                    } 496
                  } 1631
                  clear_buddies() {
                  } 496
                  update_min_vruntime() {
                  } 527
                } 4580
                hrtick_update() {
                  hrtick_start_fair() {
                  } 488
                } 1489
              } 13700
            } 14949
          } 16016
          msecs_to_jiffies() {
          } 496
          put_prev_task_fair() {
          } 504
          pick_next_task_fair() {
          } 489
          pick_next_task_rt() {
          } 496
          pick_next_task_fair() {
          } 489
          pick_next_task_idle() {
          } 489

------------8<---------- thread 4 ------------8<----------

finish_task_switch() {
} 1203
do_softirq() {
  __do_softirq() {
    __local_bh_disable() {
    } 669
    rcu_process_callbacks() {
      __rcu_process_callbacks() {
        cpu_quiet() {
          rcu_start_batch() {
          } 503
        } 1647
      } 3128
      __rcu_process_callbacks() {
      } 542
    } 5362
    _local_bh_enable() {
    } 587
  } 8880
} 9986
kthread_should_stop() {
} 669
deactivate_task() {
  dequeue_task() {
    dequeue_task_fair() {
      dequeue_entity() {
        update_curr() {
          calc_delta_mine() {
          } 511
          update_min_vruntime() {
          } 511
        } 2813

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c              |  32 +++++++----
 include/linux/ftrace.h                |  25 ++++++--
 kernel/trace/ftrace.c                 |  30 +++++-----
 kernel/trace/trace.c                  |  67 ++++++++++++++++++----
 kernel/trace/trace.h                  |  28 +++++----
 kernel/trace/trace_functions_graph.c  | 104 ++++++++++++++++++++++++++--------
 kernel/trace/trace_functions_return.c |  98 --------------------------------
 7 files changed, 208 insertions(+), 176 deletions(-)
 delete mode 100644 kernel/trace/trace_functions_return.c

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3595a4c14ab..26b2d92d48b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -347,7 +347,7 @@ void ftrace_nmi_exit(void)
 
 /* Add a function return address to the trace stack on thread info.*/
 static int push_return_trace(unsigned long ret, unsigned long long time,
-				unsigned long func)
+				unsigned long func, int *depth)
 {
 	int index;
 
@@ -365,21 +365,22 @@ static int push_return_trace(unsigned long ret, unsigned long long time,
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = time;
+	*depth = index;
 
 	return 0;
 }
 
 /* Retrieve a function return address to the trace stack on thread info.*/
-static void pop_return_trace(unsigned long *ret, unsigned long long *time,
-				unsigned long *func, unsigned long *overrun)
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 {
 	int index;
 
 	index = current->curr_ret_stack;
 	*ret = current->ret_stack[index].ret;
-	*func = current->ret_stack[index].func;
-	*time = current->ret_stack[index].calltime;
-	*overrun = atomic_read(&current->trace_overrun);
+	trace->func = current->ret_stack[index].func;
+	trace->calltime = current->ret_stack[index].calltime;
+	trace->overrun = atomic_read(&current->trace_overrun);
+	trace->depth = index;
 	current->curr_ret_stack--;
 }
 
@@ -390,12 +391,13 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time,
 unsigned long ftrace_return_to_handler(void)
 {
 	struct ftrace_graph_ret trace;
-	pop_return_trace(&trace.ret, &trace.calltime, &trace.func,
-			&trace.overrun);
+	unsigned long ret;
+
+	pop_return_trace(&trace, &ret);
 	trace.rettime = cpu_clock(raw_smp_processor_id());
-	ftrace_graph_function(&trace);
+	ftrace_graph_return(&trace);
 
-	return trace.ret;
+	return ret;
 }
 
 /*
@@ -407,6 +409,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 	unsigned long old;
 	unsigned long long calltime;
 	int faulted;
+	struct ftrace_graph_ent trace;
 	unsigned long return_hooker = (unsigned long)
 				&return_to_handler;
 
@@ -452,8 +455,15 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 
 	calltime = cpu_clock(raw_smp_processor_id());
 
-	if (push_return_trace(old, calltime, self_addr) == -EBUSY)
+	if (push_return_trace(old, calltime,
+				self_addr, &trace.depth) == -EBUSY) {
 		*parent = old;
+		return;
+	}
+
+	trace.func = self_addr;
+	ftrace_graph_entry(&trace);
+
 }
 
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b4ac734ad8d..fc2d5498719 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -312,27 +312,40 @@ ftrace_init_module(struct module *mod,
 #endif
 
 
+/*
+ * Structure that defines an entry function trace.
+ */
+struct ftrace_graph_ent {
+	unsigned long func; /* Current function */
+	int depth;
+};
+
 /*
  * Structure that defines a return function trace.
  */
 struct ftrace_graph_ret {
-	unsigned long ret; /* Return address */
 	unsigned long func; /* Current function */
 	unsigned long long calltime;
 	unsigned long long rettime;
 	/* Number of functions that overran the depth limit for current task */
 	unsigned long overrun;
+	int depth;
 };
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
-/* Type of a callback handler of tracing return function */
-typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *);
+/* Type of the callback handlers for tracing function graph*/
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
+typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+
+extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+				trace_func_graph_ent_t entryfunc);
+
+/* The current handlers in use */
+extern trace_func_graph_ret_t ftrace_graph_return;
+extern trace_func_graph_ent_t ftrace_graph_entry;
 
-extern int register_ftrace_graph(trace_function_graph_t func);
-/* The current handler in use */
-extern trace_function_graph_t ftrace_graph_function;
 extern void unregister_ftrace_graph(void);
 
 extern void ftrace_graph_init_task(struct task_struct *t);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e19976af72..7e2d3b91692 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1498,12 +1498,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_retfunc_active;
-
-/* The callback that hooks the return of a function */
-trace_function_graph_t ftrace_graph_function =
-			(trace_function_graph_t)ftrace_stub;
+static atomic_t ftrace_graph_active;
 
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+			(trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry =
+			(trace_func_graph_ent_t)ftrace_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -1569,7 +1570,8 @@ static int start_graph_tracing(void)
 	return ret;
 }
 
-int register_ftrace_graph(trace_function_graph_t func)
+int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+			trace_func_graph_ent_t entryfunc)
 {
 	int ret = 0;
 
@@ -1583,14 +1585,15 @@ int register_ftrace_graph(trace_function_graph_t func)
 		ret = -EBUSY;
 		goto out;
 	}
-	atomic_inc(&ftrace_retfunc_active);
+	atomic_inc(&ftrace_graph_active);
 	ret = start_graph_tracing();
 	if (ret) {
-		atomic_dec(&ftrace_retfunc_active);
+		atomic_dec(&ftrace_graph_active);
 		goto out;
 	}
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
-	ftrace_graph_function = func;
+	ftrace_graph_return = retfunc;
+	ftrace_graph_entry = entryfunc;
 	ftrace_startup();
 
 out:
@@ -1602,8 +1605,9 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_sysctl_lock);
 
-	atomic_dec(&ftrace_retfunc_active);
-	ftrace_graph_function = (trace_function_graph_t)ftrace_stub;
+	atomic_dec(&ftrace_graph_active);
+	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
 	ftrace_shutdown();
 	/* Restore normal tracing type */
 	ftrace_tracing_type = FTRACE_TYPE_ENTER;
@@ -1614,7 +1618,7 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-	if (atomic_read(&ftrace_retfunc_active)) {
+	if (atomic_read(&ftrace_graph_active)) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
@@ -1638,5 +1642,3 @@ void ftrace_graph_exit_task(struct task_struct *t)
 }
 #endif
 
-
-
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f21ab2c68fd..9d5f7c94f25 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -879,14 +879,38 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void __trace_function_graph(struct trace_array *tr,
+static void __trace_graph_entry(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct ftrace_graph_ent *trace,
+				unsigned long flags,
+				int pc)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_graph_ent_entry *entry;
+	unsigned long irq_flags;
+
+	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+		return;
+
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type			= TRACE_GRAPH_ENT;
+	entry->graph_ent			= *trace;
+	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+
+static void __trace_graph_return(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
 {
 	struct ring_buffer_event *event;
-	struct ftrace_graph_entry *entry;
+	struct ftrace_graph_ret_entry *entry;
 	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
@@ -898,12 +922,8 @@ static void __trace_function_graph(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_FN_RET;
-	entry->ip			= trace->func;
-	entry->parent_ip	= trace->ret;
-	entry->rettime		= trace->rettime;
-	entry->calltime		= trace->calltime;
-	entry->overrun		= trace->overrun;
+	entry->ent.type			= TRACE_GRAPH_RET;
+	entry->ret				= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
 }
 #endif
@@ -1178,7 +1198,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-void trace_function_graph(struct ftrace_graph_ret *trace)
+void trace_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1193,7 +1213,28 @@ void trace_function_graph(struct ftrace_graph_ret *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_function_graph(tr, data, trace, flags, pc);
+		__trace_graph_entry(tr, data, trace, flags, pc);
+	}
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		__trace_graph_return(tr, data, trace, flags, pc);
 	}
 	atomic_dec(&data->disabled);
 	raw_local_irq_restore(flags);
@@ -2000,9 +2041,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 			trace_seq_print_cont(s, iter);
 		break;
 	}
-	case TRACE_FN_RET: {
+	case TRACE_GRAPH_RET: {
+		return print_graph_function(iter);
+	}
+	case TRACE_GRAPH_ENT: {
 		return print_graph_function(iter);
-		break;
 	}
 	case TRACE_BRANCH: {
 		struct trace_branch *field;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 72b5ef86876..ffe1bb1eb62 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -25,7 +25,8 @@ enum trace_type {
 	TRACE_BRANCH,
 	TRACE_BOOT_CALL,
 	TRACE_BOOT_RET,
-	TRACE_FN_RET,
+	TRACE_GRAPH_RET,
+	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_BTS,
 
@@ -56,14 +57,16 @@ struct ftrace_entry {
 	unsigned long		parent_ip;
 };
 
+/* Function call entry */
+struct ftrace_graph_ent_entry {
+	struct trace_entry			ent;
+	struct ftrace_graph_ent		graph_ent;
+};
+
 /* Function return entry */
-struct ftrace_graph_entry {
-	struct trace_entry	ent;
-	unsigned long		ip;
-	unsigned long		parent_ip;
-	unsigned long long	calltime;
-	unsigned long long	rettime;
-	unsigned long		overrun;
+struct ftrace_graph_ret_entry {
+	struct trace_entry			ent;
+	struct ftrace_graph_ret		ret;
 };
 extern struct tracer boot_tracer;
 
@@ -264,7 +267,10 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
 		IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
-		IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\
+		IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,	\
+			  TRACE_GRAPH_ENT);		\
+		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
+			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
 		__ftrace_bad_type();					\
 	} while (0)
@@ -397,9 +403,9 @@ void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
-void
-trace_function_graph(struct ftrace_graph_ret *trace);
 
+void trace_graph_return(struct ftrace_graph_ret *trace);
+void trace_graph_entry(struct ftrace_graph_ent *trace);
 void trace_bts(struct trace_array *tr,
 	       unsigned long from,
 	       unsigned long to);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f5bad4624d2..b6f0cc2a00c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -13,6 +13,7 @@
 
 #include "trace.h"
 
+#define TRACE_GRAPH_INDENT	2
 
 #define TRACE_GRAPH_PRINT_OVERRUN	0x1
 static struct tracer_opt trace_opts[] = {
@@ -26,6 +27,8 @@ static struct tracer_flags tracer_flags = {
 	.opts = trace_opts
 };
 
+/* pid on the last trace processed */
+static pid_t last_pid = -1;
 
 static int graph_trace_init(struct trace_array *tr)
 {
@@ -33,7 +36,8 @@ static int graph_trace_init(struct trace_array *tr)
 	for_each_online_cpu(cpu)
 		tracing_reset(tr, cpu);
 
-	return register_ftrace_graph(&trace_function_graph);
+	return register_ftrace_graph(&trace_graph_return,
+					&trace_graph_entry);
 }
 
 static void graph_trace_reset(struct trace_array *tr)
@@ -41,45 +45,97 @@ static void graph_trace_reset(struct trace_array *tr)
 		unregister_ftrace_graph();
 }
 
+/* If the pid changed since the last trace, output this event */
+static int verif_pid(struct trace_seq *s, pid_t pid)
+{
+	if (last_pid != -1 && last_pid == pid)
+		return 1;
 
-enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+	last_pid = pid;
+	return trace_seq_printf(s, "\n------------8<---------- thread %d"
+				    " ------------8<----------\n\n",
+				  pid);
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent *call, struct trace_seq *s,
+		struct trace_entry *ent)
 {
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct ftrace_graph_entry *field;
+	int i;
 	int ret;
 
-	if (entry->type == TRACE_FN_RET) {
-		trace_assign_type(field, entry);
-		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+	if (!verif_pid(s, ent->pid))
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = seq_print_ip_sym(s, field->ip,
-					trace_flags & TRACE_ITER_SYM_MASK);
+	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+		ret = trace_seq_printf(s, " ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = seq_print_ip_sym(s, call->func, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_printf(s, "() {\n");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+		   struct trace_entry *ent)
+{
+	int i;
+	int ret;
+
+	if (!verif_pid(s, ent->pid))
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = trace_seq_printf(s, " (%llu ns)",
-					field->rettime - field->calltime);
+	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
+		ret = trace_seq_printf(s, " ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = trace_seq_printf(s, "} ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
-			ret = trace_seq_printf(s, " (Overruns: %lu)",
-						field->overrun);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
+	ret = trace_seq_printf(s, "%llu\n", trace->rettime - trace->calltime);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = trace_seq_printf(s, "\n");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+		ret = trace_seq_printf(s, " (Overruns: %lu)\n",
+					trace->overrun);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
+	}
+	return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
 
-		return TRACE_TYPE_HANDLED;
+	switch (entry->type) {
+	case TRACE_GRAPH_ENT: {
+		struct ftrace_graph_ent_entry *field;
+		trace_assign_type(field, entry);
+		return print_graph_entry(&field->graph_ent, s, entry);
+	}
+	case TRACE_GRAPH_RET: {
+		struct ftrace_graph_ret_entry *field;
+		trace_assign_type(field, entry);
+		return print_graph_return(&field->ret, s, entry);
+	}
+	default:
+		return TRACE_TYPE_UNHANDLED;
 	}
-	return TRACE_TYPE_UNHANDLED;
 }
 
 static struct tracer graph_trace __read_mostly = {
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
deleted file mode 100644
index e00d64509c9..00000000000
--- a/kernel/trace/trace_functions_return.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- *
- * Function return tracer.
- * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- * Mostly borrowed from function tracer which
- * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
- *
- */
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
-#include <linux/ftrace.h>
-#include <linux/fs.h>
-
-#include "trace.h"
-
-
-#define TRACE_RETURN_PRINT_OVERRUN	0x1
-static struct tracer_opt trace_opts[] = {
-	/* Display overruns or not */
-	{ TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) },
-	{ } /* Empty entry */
-};
-
-static struct tracer_flags tracer_flags = {
-	.val = 0, /* Don't display overruns by default */
-	.opts = trace_opts
-};
-
-
-static int return_trace_init(struct trace_array *tr)
-{
-	int cpu;
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
-	return register_ftrace_return(&trace_function_return);
-}
-
-static void return_trace_reset(struct trace_array *tr)
-{
-		unregister_ftrace_return();
-}
-
-
-enum print_line_t
-print_return_function(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct ftrace_ret_entry *field;
-	int ret;
-
-	if (entry->type == TRACE_FN_RET) {
-		trace_assign_type(field, entry);
-		ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = seq_print_ip_sym(s, field->ip,
-					trace_flags & TRACE_ITER_SYM_MASK);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, " (%llu ns)",
-					field->rettime - field->calltime);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) {
-			ret = trace_seq_printf(s, " (Overruns: %lu)",
-						field->overrun);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-
-		ret = trace_seq_printf(s, "\n");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		return TRACE_TYPE_HANDLED;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-static struct tracer return_trace __read_mostly = {
-	.name	     = "return",
-	.init	     = return_trace_init,
-	.reset	     = return_trace_reset,
-	.print_line = print_return_function,
-	.flags		= &tracer_flags,
-};
-
-static __init int init_return_trace(void)
-{
-	return register_tracer(&return_trace);
-}
-
-device_initcall(init_return_trace);
-- 
cgit v1.2.3-70-g09d2


From 5a45cfe1c64862e8cd3b0d79d7c4ba71c3118915 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 26 Nov 2008 00:16:24 -0500
Subject: ftrace: use code patching for ftrace graph tracer

Impact: more efficient code for ftrace graph tracer

This patch uses the dynamic patching, when available, to patch
the function graph code into the kernel.

This patch will ease the way for letting both function tracing
and function graph tracing run together.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S |  5 +++++
 arch/x86/kernel/ftrace.c   | 48 ++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/ftrace.h     |  5 +++++
 kernel/trace/ftrace.c      | 35 ++++++++++++++++-----------------
 4 files changed, 72 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7def9fd5c1e..958af86186c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1174,6 +1174,11 @@ ftrace_call:
 	popl %edx
 	popl %ecx
 	popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp ftrace_stub
+#endif
 
 .globl ftrace_stub
 ftrace_stub:
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 26b2d92d48b..7ef914e6a2f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -111,7 +111,6 @@ static void ftrace_mod_code(void)
 	 */
 	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
 					     MCOUNT_INSN_SIZE);
-
 }
 
 void ftrace_nmi_enter(void)
@@ -325,7 +324,51 @@ int __init ftrace_dyn_arch_init(void *data)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-#ifndef CONFIG_DYNAMIC_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+			  int old_offset, int new_offset)
+{
+	unsigned char code[MCOUNT_INSN_SIZE];
+
+	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+		return -EINVAL;
+
+	*(int *)(&code[1]) = new_offset;
+
+	if (do_ftrace_mod_code(ip, &code))
+		return -EPERM;
+
+	return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	int old_offset, new_offset;
+
+	old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+	new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	unsigned long ip = (unsigned long)(&ftrace_graph_call);
+	int old_offset, new_offset;
+
+	old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+	new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+	return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
 
 /*
  * These functions are picked from those used on
@@ -343,6 +386,7 @@ void ftrace_nmi_exit(void)
 {
 	atomic_dec(&in_nmi);
 }
+
 #endif /* !CONFIG_DYNAMIC_FTRACE */
 
 /* Add a function return address to the trace stack on thread info.*/
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index fc2d5498719..f9792c0d73f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -117,6 +117,11 @@ extern void ftrace_call(void);
 extern void mcount_call(void);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern void ftrace_graph_caller(void);
+extern int ftrace_enable_ftrace_graph_caller(void);
+extern int ftrace_disable_ftrace_graph_caller(void);
+#else
+static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
+static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
 #endif
 
 /**
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 00d98c65fad..5f7c8642d58 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -281,6 +281,8 @@ enum {
 	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
 	FTRACE_ENABLE_MCOUNT		= (1 << 3),
 	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+	FTRACE_START_FUNC_RET		= (1 << 5),
+	FTRACE_STOP_FUNC_RET		= (1 << 6),
 };
 
 static int ftrace_filtered;
@@ -465,14 +467,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	unsigned long ip, fl;
 	unsigned long ftrace_addr;
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
-		ftrace_addr = (unsigned long)ftrace_caller;
-	else
-		ftrace_addr = (unsigned long)ftrace_graph_caller;
-#else
 	ftrace_addr = (unsigned long)ftrace_caller;
-#endif
 
 	ip = rec->ip;
 
@@ -605,6 +600,11 @@ static int __ftrace_modify_code(void *data)
 	if (*command & FTRACE_UPDATE_TRACE_FUNC)
 		ftrace_update_ftrace_func(ftrace_trace_function);
 
+	if (*command & FTRACE_START_FUNC_RET)
+		ftrace_enable_ftrace_graph_caller();
+	else if (*command & FTRACE_STOP_FUNC_RET)
+		ftrace_disable_ftrace_graph_caller();
+
 	return 0;
 }
 
@@ -629,10 +629,8 @@ static void ftrace_startup_enable(int command)
 	ftrace_run_update_code(command);
 }
 
-static void ftrace_startup(void)
+static void ftrace_startup(int command)
 {
-	int command = 0;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
@@ -645,10 +643,8 @@ static void ftrace_startup(void)
 	mutex_unlock(&ftrace_start_lock);
 }
 
-static void ftrace_shutdown(void)
+static void ftrace_shutdown(int command)
 {
-	int command = 0;
-
 	if (unlikely(ftrace_disabled))
 		return;
 
@@ -1453,8 +1449,9 @@ device_initcall(ftrace_nodyn_init);
 
 static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
 static inline void ftrace_startup_enable(int command) { }
-# define ftrace_startup()		do { } while (0)
-# define ftrace_shutdown()		do { } while (0)
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(command)	do { } while (0)
+# define ftrace_shutdown(command)	do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
@@ -1585,7 +1582,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	}
 
 	ret = __register_ftrace_function(ops);
-	ftrace_startup();
+	ftrace_startup(0);
 
 out:
 	mutex_unlock(&ftrace_sysctl_lock);
@@ -1604,7 +1601,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	ret = __unregister_ftrace_function(ops);
-	ftrace_shutdown();
+	ftrace_shutdown(0);
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
@@ -1751,7 +1748,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_tracing_type = FTRACE_TYPE_RETURN;
 	ftrace_graph_return = retfunc;
 	ftrace_graph_entry = entryfunc;
-	ftrace_startup();
+	ftrace_startup(FTRACE_START_FUNC_RET);
 
 out:
 	mutex_unlock(&ftrace_sysctl_lock);
@@ -1765,7 +1762,7 @@ void unregister_ftrace_graph(void)
 	atomic_dec(&ftrace_graph_active);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
-	ftrace_shutdown();
+	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	/* Restore normal tracing type */
 	ftrace_tracing_type = FTRACE_TYPE_ENTER;
 
-- 
cgit v1.2.3-70-g09d2


From f3f47a6768a29448866da4422b6f6bee485c947f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sun, 23 Nov 2008 16:49:58 -0800
Subject: tracing: add "power-tracer": C/P state tracer to help power
 optimization

Impact: new "power-tracer" ftrace plugin

This patch adds a C/P-state ftrace plugin that will generate
detailed statistics about the C/P-states that are being used,
so that we can look at detailed decisions that the C/P-state
code is making, rather than the too high level "average"
that we have today.

An example way of using this is:

 mount -t debugfs none /sys/kernel/debug
 echo cstate > /sys/kernel/debug/tracing/current_tracer
 echo 1 > /sys/kernel/debug/tracing/tracing_enabled
 sleep 1
 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
 cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |   4 +
 arch/x86/kernel/process.c                  |  16 +++
 include/linux/ftrace.h                     |  29 +++++
 kernel/trace/Kconfig                       |  11 ++
 kernel/trace/Makefile                      |   1 +
 kernel/trace/trace.h                       |   7 ++
 kernel/trace/trace_power.c                 | 179 +++++++++++++++++++++++++++++
 scripts/trace/power.pl                     | 108 +++++++++++++++++
 8 files changed, 355 insertions(+)
 create mode 100644 kernel/trace/trace_power.c
 create mode 100644 scripts/trace/power.pl

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467..88ea02dcb62 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
+	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d..c27af49a4ed 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
 
 unsigned long idle_halt;
@@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
+		struct power_trace it;
+
+		trace_power_start(&it, POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +116,7 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
+		trace_power_end(&it);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
+	trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
+	struct power_trace it;
 	if (!need_resched()) {
+		trace_power_start(&it, POWER_CSTATE, 1);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
+		trace_power_end(&it);
 	} else
 		local_irq_enable();
 }
@@ -183,9 +195,13 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, 0);
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
+	trace_power_end(&it);
 }
 
 /*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7854d87b97b..0df28866620 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -311,6 +311,35 @@ ftrace_init_module(struct module *mod,
 		   unsigned long *start, unsigned long *end) { }
 #endif
 
+enum {
+	POWER_NONE = 0,
+	POWER_CSTATE = 1,
+	POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+	ktime_t			stamp;
+	ktime_t			end;
+	int			type;
+	int			state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
 
 /*
  * Structure that defines a return function trace.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 620feadff67..d151aab48ed 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -217,6 +217,17 @@ config BRANCH_TRACER
 
 	  Say N if unsure.
 
+config POWER_TRACER
+	bool "Trace power consumption behavior"
+	depends on DEBUG_KERNEL
+	depends on X86
+	select TRACING
+	help
+	  This tracer helps developers to analyze and optimize the kernels
+	  power management decisions, specifically the C-state and P-state
+	  behavior.
+
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cef4bcb4e82..acaa06553ec 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -32,5 +32,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_BTS_TRACER) += trace_bts.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3abd645e8af..4c453778a6a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -28,6 +28,7 @@ enum trace_type {
 	TRACE_FN_RET,
 	TRACE_USER_STACK,
 	TRACE_BTS,
+	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
 };
@@ -160,6 +161,11 @@ struct bts_entry {
 	unsigned long		to;
 };
 
+struct trace_power {
+	struct trace_entry	ent;
+	struct power_trace	state_data;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -266,6 +272,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
 		IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
 		IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
+ 		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 00000000000..a7172a352f6
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
+/*
+ * ring buffer based C-state tracer
+ *
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * Much is borrowed from trace_boot.c which is
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+
+#include "trace.h"
+
+static struct trace_array *power_trace;
+static int __read_mostly trace_power_enabled;
+
+
+static void start_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 1;
+}
+
+static void stop_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 0;
+}
+
+
+static int power_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	power_trace = tr;
+
+	trace_power_enabled = 1;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+	return 0;
+}
+
+static enum print_line_t power_print_line(struct trace_iterator *iter)
+{
+	int ret = 0;
+	struct trace_entry *entry = iter->ent;
+	struct trace_power *field ;
+	struct power_trace *it;
+	struct trace_seq *s = &iter->seq;
+	struct timespec stamp;
+	struct timespec duration;
+
+	trace_assign_type(field, entry);
+	it = &field->state_data;
+	stamp = ktime_to_timespec(it->stamp);
+	duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
+
+	if (entry->type == TRACE_POWER) {
+		if (it->type == POWER_CSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu,
+					  duration.tv_sec,
+					  duration.tv_nsec);
+		if (it->type == POWER_PSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer power_tracer __read_mostly =
+{
+	.name		= "power",
+	.init		= power_trace_init,
+	.start		= start_power_trace,
+	.stop		= stop_power_trace,
+	.reset		= stop_power_trace,
+	.print_line	= power_print_line,
+};
+
+static int init_power_trace(void)
+{
+	return register_tracer(&power_tracer);
+}
+device_initcall(init_power_trace);
+
+void trace_power_start(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+EXPORT_SYMBOL_GPL(trace_power_start);
+
+
+void trace_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_end);
+
+void trace_power_mark(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl
new file mode 100644
index 00000000000..4f729b3501e
--- /dev/null
+++ b/scripts/trace/power.pl
@@ -0,0 +1,108 @@
+#!/usr/bin/perl
+
+# Copyright 2008, Intel Corporation
+#
+# This file is part of the Linux kernel
+#
+# This program file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program in a file named COPYING; if not, write to the
+# Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor,
+# Boston, MA 02110-1301 USA
+#
+# Authors:
+# 	Arjan van de Ven <arjan@linux.intel.com>
+
+
+#
+# This script turns a cstate ftrace output into a SVG graphic that shows
+# historic C-state information
+#
+#
+# 	cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg
+#
+
+my @styles;
+my $base = 0;
+
+my @pstate_last;
+my @pstate_level;
+
+$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+
+
+print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
+print "<svg width=\"10000\" height=\"100%\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n";
+
+my $scale = 30000.0;
+while (<>) {
+	my $line = $_;
+	if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) {
+		if ($base == 0) {
+			$base = $1;
+		}
+		my $time = $1 - $base;
+		$time = $time * $scale;
+		my $C = $2;
+		my $cpu = $3;
+		my $y = 400 * $cpu;
+		my $duration = $4 * $scale;
+		my $msec = int($4 * 100000)/100.0;
+		my $height = $C * 20;
+		$style = $styles[$C];
+
+		$y = $y + 140 - $height;
+
+		$x2 = $time + 4;
+		$y2 = $y + 4;
+
+
+		print "<rect x=\"$time\" width=\"$duration\" y=\"$y\" height=\"$height\" style=\"$style\"/>\n";
+		print "<text transform=\"translate($x2,$y2) rotate(90)\">C$C $msec</text>\n";
+	}
+	if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) {
+		my $time = $1 - $base;
+		my $state = $2;
+		my $cpu = $3;
+
+		if (defined($pstate_last[$cpu])) {
+			my $from = $pstate_last[$cpu];
+			my $oldstate = $pstate_state[$cpu];
+			my $duration = ($time-$from) * $scale;
+
+			$from = $from * $scale;
+			my $to = $from + $duration;
+			my $height = 140 - ($oldstate * (140/8));
+
+			my $y = 400 * $cpu + 200 + $height;
+			my $y2 = $y+4;
+			my $style = $styles[8];
+
+			print "<rect x=\"$from\" y=\"$y\" width=\"$duration\" height=\"5\" style=\"$style\"/>\n";
+			print "<text transform=\"translate($from,$y2)\">P$oldstate (cpu $cpu)</text>\n";
+		};
+
+		$pstate_last[$cpu] = $time;
+		$pstate_state[$cpu] = $state;
+	}
+}
+
+
+print "</svg>\n";
-- 
cgit v1.2.3-70-g09d2


From 5f3ea37c7716db4e894a480e0c18b24399595b6b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 30 Oct 2008 08:34:33 +0100
Subject: blktrace: port to tracepoints

This was a forward port of work done by Mathieu Desnoyers, I changed it to
encode the 'what' parameter on the tracepoint name, so that one can register
interest in specific events and not on classes of events to then check the
'what' parameter.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/Kconfig                |   1 +
 block/blk-core.c             |  33 ++---
 block/blktrace.c             | 332 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |   7 +-
 drivers/md/dm.c              |   6 +-
 fs/bio.c                     |   3 +-
 include/linux/blktrace_api.h | 172 +---------------------
 include/trace/block.h        |  60 ++++++++
 mm/bounce.c                  |   3 +-
 9 files changed, 418 insertions(+), 199 deletions(-)
 create mode 100644 include/trace/block.h

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 1ab7c15c8d7..290b219fad9 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select TRACEPOINTS
 	help
 	  Say Y here if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
diff --git a/block/blk-core.c b/block/blk-core.c
index 10e8a64a5a5..04267d66a2b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
+#include <trace/block.h>
 
 #include "blk.h"
 
@@ -205,7 +206,7 @@ void blk_plug_device(struct request_queue *q)
 
 	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_block_plug(q);
 	}
 }
 EXPORT_SYMBOL(blk_plug_device);
@@ -292,9 +293,7 @@ void blk_unplug_work(struct work_struct *work)
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-				q->rq.count[READ] + q->rq.count[WRITE]);
-
+	trace_block_unplug_io(q);
 	q->unplug_fn(q);
 }
 
@@ -302,9 +301,7 @@ void blk_unplug_timeout(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
-				q->rq.count[READ] + q->rq.count[WRITE]);
-
+	trace_block_unplug_timer(q);
 	kblockd_schedule_work(q, &q->unplug_work);
 }
 
@@ -314,9 +311,7 @@ void blk_unplug(struct request_queue *q)
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-					q->rq.count[READ] + q->rq.count[WRITE]);
-
+		trace_block_unplug_io(q);
 		q->unplug_fn(q);
 	}
 }
@@ -822,7 +817,7 @@ rq_starved:
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_block_getrq(q, bio, rw);
 out:
 	return rq;
 }
@@ -848,7 +843,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
 				TASK_UNINTERRUPTIBLE);
 
-		blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+		trace_block_sleeprq(q, bio, rw);
 
 		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
@@ -928,7 +923,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_block_rq_requeue(q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -1167,7 +1162,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!ll_back_merge_fn(q, req, bio))
 			break;
 
-		blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+		trace_block_bio_backmerge(q, bio);
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
@@ -1186,7 +1181,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!ll_front_merge_fn(q, req, bio))
 			break;
 
-		blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+		trace_block_bio_frontmerge(q, bio);
 
 		bio->bi_next = req->bio;
 		req->bio = bio;
@@ -1269,7 +1264,7 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
+		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
 				    bdev->bd_dev, bio->bi_sector,
 				    bio->bi_sector - p->start_sect);
 	}
@@ -1441,10 +1436,10 @@ end_io:
 			goto end_io;
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+			trace_block_remap(q, bio, old_dev, bio->bi_sector,
 					    old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_block_bio_queue(q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -1656,7 +1651,7 @@ static int __end_that_request_first(struct request *req, int error,
 	int total_bytes, bio_nbytes, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_block_rq_complete(req->q, req);
 
 	/*
 	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
diff --git a/block/blktrace.c b/block/blktrace.c
index 85049a7e7a1..b0a2cae886d 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -23,10 +23,18 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <trace/block.h>
 #include <asm/uaccess.h>
 
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
 /*
  * Send out a notify message.
  */
@@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
  * The worker for the various blk_add_trace*() types. Fills out a
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
-void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 {
 	struct task_struct *tsk = current;
@@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	local_irq_restore(flags);
 }
 
-EXPORT_SYMBOL_GPL(__blk_add_trace);
-
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
@@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1) {
+		ret = blk_register_tracepoints();
+		if (ret)
+			goto probe_err;
+	}
+	mutex_unlock(&blk_probe_mutex);
+
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt) {
@@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 
 	return 0;
+probe_err:
+	atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
 err:
 	if (dir)
 		blk_remove_tree(dir);
@@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q)
 		blk_trace_remove(q);
 	}
 }
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+				sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				rw, what, rq->errors, 0, NULL);
+	}
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+	}
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
+	}
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt)
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+				unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:		queue the io is for
+ * @rq:		io request
+ * @data:	driver-specific data
+ * @len:	length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+			 struct request *rq,
+			 void *data, size_t len)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq))
+		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+				rq->errors, len, data);
+	else
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static int blk_register_tracepoints(void)
+{
+	int ret;
+
+	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+	WARN_ON(ret);
+	ret = register_trace_block_getrq(blk_add_trace_getrq);
+	WARN_ON(ret);
+	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+	WARN_ON(ret);
+	ret = register_trace_block_plug(blk_add_trace_plug);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+	WARN_ON(ret);
+	ret = register_trace_block_split(blk_add_trace_split);
+	WARN_ON(ret);
+	ret = register_trace_block_remap(blk_add_trace_remap);
+	WARN_ON(ret);
+	return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+	unregister_trace_block_remap(blk_add_trace_remap);
+	unregister_trace_block_split(blk_add_trace_split);
+	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	unregister_trace_block_plug(blk_add_trace_plug);
+	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+	unregister_trace_block_getrq(blk_add_trace_getrq);
+	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+	tracepoint_synchronize_unregister();
+}
diff --git a/block/elevator.c b/block/elevator.c
index 9ac82dde99d..530fcfe2ef0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
@@ -586,7 +587,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_block_rq_insert(q, rq);
 
 	rq->q = q;
 
@@ -772,7 +773,7 @@ struct request *elv_next_request(struct request_queue *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_block_rq_issue(q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -921,7 +922,7 @@ void elv_abort_queue(struct request_queue *q)
 	while (!list_empty(&q->queue_head)) {
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
-		blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+		trace_block_rq_abort(q, rq);
 		__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
 	}
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c99e4728ff4..d23fda17816 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -504,8 +505,7 @@ static void dec_pending(struct dm_io *io, int error)
 		end_io_acct(io);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_block_bio_complete(io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->error);
 		}
@@ -598,7 +598,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
 				    tio->io->bio->bi_bdev->bd_dev,
 				    clone->bi_sector, sector);
 
diff --git a/fs/bio.c b/fs/bio.c
index 77a55bcceed..060859c6909 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,6 +26,7 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 static struct kmem_cache *bio_slab __read_mostly;
@@ -1263,7 +1264,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
 				bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index bdf505d33e7..1dba3493d52 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -160,7 +160,6 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 extern int do_blk_trace_setup(struct request_queue *q,
 	char *name, dev_t dev, struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
@@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 	} while (0)
 #define BLK_TN_MAX_MSG		128
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_discard_rq(rq))
-		rw |= (1 << BIO_RW_DISCARD);
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
-/**
- * blk_add_driver_data - Add binary message with driver-specific data
- * @q:		queue the io is for
- * @rq:		io request
- * @data:	driver-specific data
- * @len:	length of driver-specific data
- *
- * Description:
- *     Some drivers might want to write driver-specific data per request.
- *
- **/
-static inline void blk_add_driver_data(struct request_queue *q,
-				       struct request *rq,
-				       void *data, size_t len)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq))
-		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-				rq->errors, len, data);
-	else
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-				0, BLK_TA_DRV_DATA, rq->errors, len, data);
-}
-
+extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
+				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q);
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
-#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
 #define do_blk_trace_setup(q, name, dev, buts)	(-ENOTTY)
+#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
 #define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
 #define blk_trace_startstop(q, start)		(-ENOTTY)
 #define blk_trace_remove(q)			(-ENOTTY)
diff --git a/include/trace/block.h b/include/trace/block.h
new file mode 100644
index 00000000000..3cc2675ebf0
--- /dev/null
+++ b/include/trace/block.h
@@ -0,0 +1,60 @@
+#ifndef _TRACE_BLOCK_H
+#define _TRACE_BLOCK_H
+
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+DEFINE_TRACE(block_rq_abort,
+	TPPROTO(struct request_queue *q, struct request *rq),
+	TPARGS(q, rq));
+DEFINE_TRACE(block_rq_insert,
+	TPPROTO(struct request_queue *q, struct request *rq),
+	TPARGS(q, rq));
+DEFINE_TRACE(block_rq_issue,
+	TPPROTO(struct request_queue *q, struct request *rq),
+	TPARGS(q, rq));
+DEFINE_TRACE(block_rq_requeue,
+	TPPROTO(struct request_queue *q, struct request *rq),
+	TPARGS(q, rq));
+DEFINE_TRACE(block_rq_complete,
+	TPPROTO(struct request_queue *q, struct request *rq),
+	TPARGS(q, rq));
+DEFINE_TRACE(block_bio_bounce,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+	TPARGS(q, bio));
+DEFINE_TRACE(block_bio_complete,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+	TPARGS(q, bio));
+DEFINE_TRACE(block_bio_backmerge,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+	TPARGS(q, bio));
+DEFINE_TRACE(block_bio_frontmerge,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+	TPARGS(q, bio));
+DEFINE_TRACE(block_bio_queue,
+	TPPROTO(struct request_queue *q, struct bio *bio),
+	TPARGS(q, bio));
+DEFINE_TRACE(block_getrq,
+	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
+	TPARGS(q, bio, rw));
+DEFINE_TRACE(block_sleeprq,
+	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
+	TPARGS(q, bio, rw));
+DEFINE_TRACE(block_plug,
+	TPPROTO(struct request_queue *q),
+	TPARGS(q));
+DEFINE_TRACE(block_unplug_timer,
+	TPPROTO(struct request_queue *q),
+	TPARGS(q));
+DEFINE_TRACE(block_unplug_io,
+	TPPROTO(struct request_queue *q),
+	TPARGS(q));
+DEFINE_TRACE(block_split,
+	TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
+	TPARGS(q, bio, pdu));
+DEFINE_TRACE(block_remap,
+	TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		sector_t from, sector_t to),
+	TPARGS(q, bio, dev, from, to));
+
+#endif
diff --git a/mm/bounce.c b/mm/bounce.c
index 06722c40305..bd1caaa582b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,6 +14,7 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -222,7 +223,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_block_bio_bounce(q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
-- 
cgit v1.2.3-70-g09d2


From dcc7461eef7341e84e2f7274f904ce01a43b2506 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Wed, 26 Nov 2008 13:36:59 +0000
Subject: wusb: add debug files for ASL, PZL and DI to the whci-hcd driver

Add asl, pzl and di debugfs files to uwb/uwbN/wusbhc for WHCI host
controller.  These dump the current ASL, PZL and DI buffer.

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/usb/host/whci/Kbuild  |   1 +
 drivers/usb/host/whci/asl.c   |  25 ------
 drivers/usb/host/whci/debug.c | 189 ++++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/whci/hcd.c   |   3 +
 drivers/usb/host/whci/pzl.c   |  28 -------
 drivers/usb/host/whci/qset.c  |  40 ---------
 drivers/usb/host/whci/whcd.h  |   9 +-
 drivers/usb/host/whci/wusb.c  |  27 ------
 drivers/uwb/pal.c             |   5 ++
 drivers/uwb/uwb-debug.c       |  13 +++
 drivers/uwb/uwb-internal.h    |   3 +-
 include/linux/uwb.h           |   3 +
 12 files changed, 223 insertions(+), 123 deletions(-)
 create mode 100644 drivers/usb/host/whci/debug.c

(limited to 'include/linux')

diff --git a/drivers/usb/host/whci/Kbuild b/drivers/usb/host/whci/Kbuild
index 26a3871ea0f..11e5040b833 100644
--- a/drivers/usb/host/whci/Kbuild
+++ b/drivers/usb/host/whci/Kbuild
@@ -2,6 +2,7 @@ obj-$(CONFIG_USB_WHCI_HCD) += whci-hcd.o
 
 whci-hcd-y := \
 	asl.o	\
+	debug.o \
 	hcd.o 	\
 	hw.o	\
 	init.o	\
diff --git a/drivers/usb/host/whci/asl.c b/drivers/usb/host/whci/asl.c
index ba99a7a3f81..577c0d29849 100644
--- a/drivers/usb/host/whci/asl.c
+++ b/drivers/usb/host/whci/asl.c
@@ -19,32 +19,11 @@
 #include <linux/dma-mapping.h>
 #include <linux/uwb/umc.h>
 #include <linux/usb.h>
-#define D_LOCAL 0
-#include <linux/uwb/debug.h>
 
 #include "../../wusbcore/wusbhc.h"
 
 #include "whcd.h"
 
-#if D_LOCAL >= 4
-static void dump_asl(struct whc *whc, const char *tag)
-{
-	struct device *dev = &whc->umc->dev;
-	struct whc_qset *qset;
-
-	d_printf(4, dev, "ASL %s\n", tag);
-
-	list_for_each_entry(qset, &whc->async_list, list_node) {
-		dump_qset(qset, dev);
-	}
-}
-#else
-static inline void dump_asl(struct whc *whc, const char *tag)
-{
-}
-#endif
-
-
 static void qset_get_next_prev(struct whc *whc, struct whc_qset *qset,
 			       struct whc_qset **next, struct whc_qset **prev)
 {
@@ -217,8 +196,6 @@ void scan_async_work(struct work_struct *work)
 
 	spin_lock_irq(&whc->lock);
 
-	dump_asl(whc, "before processing");
-
 	/*
 	 * Transerve the software list backwards so new qsets can be
 	 * safely inserted into the ASL without making it non-circular.
@@ -232,8 +209,6 @@ void scan_async_work(struct work_struct *work)
 		update |= process_qset(whc, qset);
 	}
 
-	dump_asl(whc, "after processing");
-
 	spin_unlock_irq(&whc->lock);
 
 	if (update) {
diff --git a/drivers/usb/host/whci/debug.c b/drivers/usb/host/whci/debug.c
new file mode 100644
index 00000000000..cf2d45946c5
--- /dev/null
+++ b/drivers/usb/host/whci/debug.c
@@ -0,0 +1,189 @@
+/*
+ * Wireless Host Controller (WHC) debug.
+ *
+ * Copyright (C) 2008 Cambridge Silicon Radio Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/kernel.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "../../wusbcore/wusbhc.h"
+
+#include "whcd.h"
+
+struct whc_dbg {
+	struct dentry *di_f;
+	struct dentry *asl_f;
+	struct dentry *pzl_f;
+};
+
+void qset_print(struct seq_file *s, struct whc_qset *qset)
+{
+	struct whc_std *std;
+	struct urb *urb = NULL;
+	int i;
+
+	seq_printf(s, "qset %08x\n", (u32)qset->qset_dma);
+	seq_printf(s, "  -> %08x\n", (u32)qset->qh.link);
+	seq_printf(s, "  info: %08x %08x %08x\n",
+		qset->qh.info1, qset->qh.info2,  qset->qh.info3);
+	seq_printf(s, "  sts: %04x errs: %d\n", qset->qh.status, qset->qh.err_count);
+	seq_printf(s, "  TD: sts: %08x opts: %08x\n",
+		qset->qh.overlay.qtd.status, qset->qh.overlay.qtd.options);
+
+	for (i = 0; i < WHCI_QSET_TD_MAX; i++) {
+		seq_printf(s, "  %c%c TD[%d]: sts: %08x opts: %08x ptr: %08x\n",
+			i == qset->td_start ? 'S' : ' ',
+			i == qset->td_end ? 'E' : ' ',
+			i, qset->qtd[i].status, qset->qtd[i].options,
+			(u32)qset->qtd[i].page_list_ptr);
+	}
+	seq_printf(s, "  ntds: %d\n", qset->ntds);
+	list_for_each_entry(std, &qset->stds, list_node) {
+		if (urb != std->urb) {
+			urb = std->urb;
+			seq_printf(s, "  urb %p transferred: %d bytes\n", urb,
+				urb->actual_length);
+		}
+		if (std->qtd)
+			seq_printf(s, "    sTD[%td]: %zu bytes @ %08x\n",
+				std->qtd - &qset->qtd[0],
+				std->len, std->num_pointers ?
+				(u32)(std->pl_virt[0].buf_ptr) : (u32)std->dma_addr);
+		else
+			seq_printf(s, "    sTD[-]: %zd bytes @ %08x\n",
+				std->len, std->num_pointers ?
+				(u32)(std->pl_virt[0].buf_ptr) : (u32)std->dma_addr);
+	}
+}
+
+static int di_print(struct seq_file *s, void *p)
+{
+	struct whc *whc = s->private;
+	char buf[72];
+	int d;
+
+	for (d = 0; d < whc->n_devices; d++) {
+		struct di_buf_entry *di = &whc->di_buf[d];
+
+		bitmap_scnprintf(buf, sizeof(buf),
+				 (unsigned long *)di->availability_info, UWB_NUM_MAS);
+
+		seq_printf(s, "DI[%d]\n", d);
+		seq_printf(s, "  availability: %s\n", buf);
+		seq_printf(s, "  %c%c key idx: %d dev addr: %d\n",
+			   (di->addr_sec_info & WHC_DI_SECURE) ? 'S' : ' ',
+			   (di->addr_sec_info & WHC_DI_DISABLE) ? 'D' : ' ',
+			   (di->addr_sec_info & WHC_DI_KEY_IDX_MASK) >> 8,
+			   (di->addr_sec_info & WHC_DI_DEV_ADDR_MASK));
+	}
+	return 0;
+}
+
+static int asl_print(struct seq_file *s, void *p)
+{
+	struct whc *whc = s->private;
+	struct whc_qset *qset;
+
+	list_for_each_entry(qset, &whc->async_list, list_node) {
+		qset_print(s, qset);
+	}
+
+	return 0;
+}
+
+static int pzl_print(struct seq_file *s, void *p)
+{
+	struct whc *whc = s->private;
+	struct whc_qset *qset;
+	int period;
+
+	for (period = 0; period < 5; period++) {
+		seq_printf(s, "Period %d\n", period);
+		list_for_each_entry(qset, &whc->periodic_list[period], list_node) {
+			qset_print(s, qset);
+		}
+	}
+	return 0;
+}
+
+static int di_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, di_print, inode->i_private);
+}
+
+static int asl_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, asl_print, inode->i_private);
+}
+
+static int pzl_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pzl_print, inode->i_private);
+}
+
+static struct file_operations di_fops = {
+	.open    = di_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+	.owner   = THIS_MODULE,
+};
+
+static struct file_operations asl_fops = {
+	.open    = asl_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+	.owner   = THIS_MODULE,
+};
+
+static struct file_operations pzl_fops = {
+	.open    = pzl_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+	.owner   = THIS_MODULE,
+};
+
+void whc_dbg_init(struct whc *whc)
+{
+	if (whc->wusbhc.pal.debugfs_dir == NULL)
+		return;
+
+	whc->dbg = kzalloc(sizeof(struct whc_dbg), GFP_KERNEL);
+	if (whc->dbg == NULL)
+		return;
+
+	whc->dbg->di_f = debugfs_create_file("di", 0444,
+					      whc->wusbhc.pal.debugfs_dir, whc,
+					      &di_fops);
+	whc->dbg->asl_f = debugfs_create_file("asl", 0444,
+					      whc->wusbhc.pal.debugfs_dir, whc,
+					      &asl_fops);
+	whc->dbg->pzl_f = debugfs_create_file("pzl", 0444,
+					      whc->wusbhc.pal.debugfs_dir, whc,
+					      &pzl_fops);
+}
+
+void whc_dbg_clean_up(struct whc *whc)
+{
+	if (whc->dbg) {
+		debugfs_remove(whc->dbg->pzl_f);
+		debugfs_remove(whc->dbg->asl_f);
+		debugfs_remove(whc->dbg->di_f);
+		kfree(whc->dbg);
+	}
+}
diff --git a/drivers/usb/host/whci/hcd.c b/drivers/usb/host/whci/hcd.c
index f599f89d3be..1569afd6245 100644
--- a/drivers/usb/host/whci/hcd.c
+++ b/drivers/usb/host/whci/hcd.c
@@ -273,6 +273,8 @@ static int whc_probe(struct umc_dev *umc)
 		goto error_wusbhc_b_create;
 	}
 
+	whc_dbg_init(whc);
+
 	return 0;
 
 error_wusbhc_b_create:
@@ -296,6 +298,7 @@ static void whc_remove(struct umc_dev *umc)
 	struct whc *whc = wusbhc_to_whc(wusbhc);
 
 	if (usb_hcd) {
+		whc_dbg_clean_up(whc);
 		wusbhc_b_destroy(wusbhc);
 		usb_remove_hcd(usb_hcd);
 		wusbhc_destroy(wusbhc);
diff --git a/drivers/usb/host/whci/pzl.c b/drivers/usb/host/whci/pzl.c
index 34d3a0aeab2..2ae5abf69a6 100644
--- a/drivers/usb/host/whci/pzl.c
+++ b/drivers/usb/host/whci/pzl.c
@@ -19,35 +19,11 @@
 #include <linux/dma-mapping.h>
 #include <linux/uwb/umc.h>
 #include <linux/usb.h>
-#define D_LOCAL 0
-#include <linux/uwb/debug.h>
 
 #include "../../wusbcore/wusbhc.h"
 
 #include "whcd.h"
 
-#if D_LOCAL >= 4
-static void dump_pzl(struct whc *whc, const char *tag)
-{
-	struct device *dev = &whc->umc->dev;
-	struct whc_qset *qset;
-	int period = 0;
-
-	d_printf(4, dev, "PZL %s\n", tag);
-
-	for (period = 0; period < 5; period++) {
-		d_printf(4, dev, "Period %d\n", period);
-		list_for_each_entry(qset, &whc->periodic_list[period], list_node) {
-			dump_qset(qset, dev);
-		}
-	}
-}
-#else
-static inline void dump_pzl(struct whc *whc, const char *tag)
-{
-}
-#endif
-
 static void update_pzl_pointers(struct whc *whc, int period, u64 addr)
 {
 	switch (period) {
@@ -250,8 +226,6 @@ void scan_periodic_work(struct work_struct *work)
 
 	spin_lock_irq(&whc->lock);
 
-	dump_pzl(whc, "before processing");
-
 	for (period = 4; period >= 0; period--) {
 		list_for_each_entry_safe(qset, t, &whc->periodic_list[period], list_node) {
 			if (!qset->in_hw_list)
@@ -263,8 +237,6 @@ void scan_periodic_work(struct work_struct *work)
 	if (update & (WHC_UPDATE_ADDED | WHC_UPDATE_REMOVED))
 		update_pzl_hw_view(whc);
 
-	dump_pzl(whc, "after processing");
-
 	spin_unlock_irq(&whc->lock);
 
 	if (update) {
diff --git a/drivers/usb/host/whci/qset.c b/drivers/usb/host/whci/qset.c
index 0420037d2e1..7be74314ee1 100644
--- a/drivers/usb/host/whci/qset.c
+++ b/drivers/usb/host/whci/qset.c
@@ -24,46 +24,6 @@
 
 #include "whcd.h"
 
-void dump_qset(struct whc_qset *qset, struct device *dev)
-{
-	struct whc_std *std;
-	struct urb *urb = NULL;
-	int i;
-
-	dev_dbg(dev, "qset %08x\n", (u32)qset->qset_dma);
-	dev_dbg(dev, "  -> %08x\n", (u32)qset->qh.link);
-	dev_dbg(dev, "  info: %08x %08x %08x\n",
-		qset->qh.info1, qset->qh.info2,  qset->qh.info3);
-	dev_dbg(dev, "  sts: %04x errs: %d\n", qset->qh.status, qset->qh.err_count);
-	dev_dbg(dev, "  TD: sts: %08x opts: %08x\n",
-		qset->qh.overlay.qtd.status, qset->qh.overlay.qtd.options);
-
-	for (i = 0; i < WHCI_QSET_TD_MAX; i++) {
-		dev_dbg(dev, "  %c%c TD[%d]: sts: %08x opts: %08x ptr: %08x\n",
-			i == qset->td_start ? 'S' : ' ',
-			i == qset->td_end ? 'E' : ' ',
-			i, qset->qtd[i].status, qset->qtd[i].options,
-			(u32)qset->qtd[i].page_list_ptr);
-	}
-	dev_dbg(dev, "  ntds: %d\n", qset->ntds);
-	list_for_each_entry(std, &qset->stds, list_node) {
-		if (urb != std->urb) {
-			urb = std->urb;
-			dev_dbg(dev, "  urb %p transferred: %d bytes\n", urb,
-				urb->actual_length);
-		}
-		if (std->qtd)
-			dev_dbg(dev, "    sTD[%td]: %zu bytes @ %08x\n",
-				std->qtd - &qset->qtd[0],
-				std->len, std->num_pointers ?
-				(u32)(std->pl_virt[0].buf_ptr) : (u32)std->dma_addr);
-		else
-			dev_dbg(dev, "    sTD[-]: %zd bytes @ %08x\n",
-				std->len, std->num_pointers ?
-				(u32)(std->pl_virt[0].buf_ptr) : (u32)std->dma_addr);
-	}
-}
-
 struct whc_qset *qset_alloc(struct whc *whc, gfp_t mem_flags)
 {
 	struct whc_qset *qset;
diff --git a/drivers/usb/host/whci/whcd.h b/drivers/usb/host/whci/whcd.h
index 1bbb8cb6bf8..0f3540f04f5 100644
--- a/drivers/usb/host/whci/whcd.h
+++ b/drivers/usb/host/whci/whcd.h
@@ -21,6 +21,7 @@
 #define __WHCD_H
 
 #include <linux/uwb/whci.h>
+#include <linux/uwb/umc.h>
 #include <linux/workqueue.h>
 
 #include "whci-hc.h"
@@ -28,6 +29,7 @@
 /* Generic command timeout. */
 #define WHC_GENCMD_TIMEOUT_MS 100
 
+struct whc_dbg;
 
 struct whc {
 	struct wusbhc wusbhc;
@@ -69,6 +71,8 @@ struct whc {
 	struct list_head periodic_removed_list;
 	wait_queue_head_t periodic_list_wq;
 	struct work_struct periodic_work;
+
+	struct whc_dbg *dbg;
 };
 
 #define wusbhc_to_whc(w) (container_of((w), struct whc, wusbhc))
@@ -190,8 +194,11 @@ void process_inactive_qtd(struct whc *whc, struct whc_qset *qset,
 				 struct whc_qtd *qtd);
 enum whc_update qset_add_qtds(struct whc *whc, struct whc_qset *qset);
 void qset_remove_complete(struct whc *whc, struct whc_qset *qset);
-void dump_qset(struct whc_qset *qset, struct device *dev);
 void pzl_update(struct whc *whc, uint32_t wusbcmd);
 void asl_update(struct whc *whc, uint32_t wusbcmd);
 
+/* debug.c */
+void whc_dbg_init(struct whc *whc);
+void whc_dbg_clean_up(struct whc *whc);
+
 #endif /* #ifndef __WHCD_H */
diff --git a/drivers/usb/host/whci/wusb.c b/drivers/usb/host/whci/wusb.c
index 540021a0971..f24efdebad1 100644
--- a/drivers/usb/host/whci/wusb.c
+++ b/drivers/usb/host/whci/wusb.c
@@ -18,43 +18,16 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/uwb/umc.h>
-#define D_LOCAL 1
-#include <linux/uwb/debug.h>
 
 #include "../../wusbcore/wusbhc.h"
 
 #include "whcd.h"
 
-#if D_LOCAL >= 1
-static void dump_di(struct whc *whc, int idx)
-{
-	struct di_buf_entry *di = &whc->di_buf[idx];
-	struct device *dev = &whc->umc->dev;
-	char buf[128];
-
-	bitmap_scnprintf(buf, sizeof(buf), (unsigned long *)di->availability_info, UWB_NUM_MAS);
-
-	d_printf(1, dev, "DI[%d]\n", idx);
-	d_printf(1, dev, "  availability: %s\n", buf);
-	d_printf(1, dev, "  %c%c key idx: %d dev addr: %d\n",
-		 (di->addr_sec_info & WHC_DI_SECURE) ? 'S' : ' ',
-		 (di->addr_sec_info & WHC_DI_DISABLE) ? 'D' : ' ',
-		 (di->addr_sec_info & WHC_DI_KEY_IDX_MASK) >> 8,
-		 (di->addr_sec_info & WHC_DI_DEV_ADDR_MASK));
-}
-#else
-static inline void dump_di(struct whc *whc, int idx)
-{
-}
-#endif
-
 static int whc_update_di(struct whc *whc, int idx)
 {
 	int offset = idx / 32;
 	u32 bit = 1 << (idx % 32);
 
-	dump_di(whc, idx);
-
 	le_writel(bit, whc->base + WUSBDIBUPDATED + offset);
 
 	return whci_wait_for(&whc->umc->dev,
diff --git a/drivers/uwb/pal.c b/drivers/uwb/pal.c
index 605765124f5..99a19c19909 100644
--- a/drivers/uwb/pal.c
+++ b/drivers/uwb/pal.c
@@ -16,6 +16,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 #include <linux/kernel.h>
+#include <linux/debugfs.h>
 #include <linux/uwb.h>
 
 #include "uwb-internal.h"
@@ -54,6 +55,8 @@ int uwb_pal_register(struct uwb_pal *pal)
 		}
 	}
 
+	pal->debugfs_dir = uwb_dbg_create_pal_dir(pal);
+
 	mutex_lock(&rc->uwb_dev.mutex);
 	list_add(&pal->node, &rc->pals);
 	mutex_unlock(&rc->uwb_dev.mutex);
@@ -76,6 +79,8 @@ void uwb_pal_unregister(struct uwb_pal *pal)
 	list_del(&pal->node);
 	mutex_unlock(&rc->uwb_dev.mutex);
 
+	debugfs_remove(pal->debugfs_dir);
+
 	if (pal->device) {
 		sysfs_remove_link(&rc->uwb_dev.dev.kobj, pal->name);
 		sysfs_remove_link(&pal->device->kobj, "uwb_rc");
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index ec1b7a403ff..a6debb9baf3 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -407,3 +407,16 @@ void uwb_dbg_exit(void)
 {
 	debugfs_remove(root_dir);
 }
+
+/**
+ * uwb_dbg_create_pal_dir - create a debugfs directory for a PAL
+ * @pal: The PAL.
+ */
+struct dentry *uwb_dbg_create_pal_dir(struct uwb_pal *pal)
+{
+	struct uwb_rc *rc = pal->rc;
+
+	if (root_dir && rc->dbg && rc->dbg->root_d && pal->name)
+		return debugfs_create_dir(pal->name, rc->dbg->root_d);
+	return NULL;
+}
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 9c0cdb4ded0..f0f21f406bf 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -284,8 +284,7 @@ void uwb_dbg_init(void);
 void uwb_dbg_exit(void);
 void uwb_dbg_add_rc(struct uwb_rc *rc);
 void uwb_dbg_del_rc(struct uwb_rc *rc);
-
-/* Workarounds for version specific stuff */
+struct dentry *uwb_dbg_create_pal_dir(struct uwb_pal *pal);
 
 static inline void uwb_dev_lock(struct uwb_dev *uwb_dev)
 {
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index 1719709d60c..d7ed5201ade 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -394,6 +394,8 @@ struct uwb_rc {
  * @channel: channel being used by the PAL; 0 if the PAL isn't using
  *           the radio; -1 if the PAL wishes to use the radio but
  *           cannot.
+ * @debugfs_dir: a debugfs directory which the PAL can use for its own
+ *           debugfs files.
  *
  * A Protocol Adaptation Layer (PAL) is a user of the WiMedia UWB
  * radio platform (e.g., WUSB, WLP or Bluetooth UWB AMP).
@@ -418,6 +420,7 @@ struct uwb_pal {
 	void (*new_rsv)(struct uwb_pal *pal, struct uwb_rsv *rsv);
 
 	int channel;
+	struct dentry *debugfs_dir;
 };
 
 void uwb_pal_init(struct uwb_pal *pal);
-- 
cgit v1.2.3-70-g09d2


From ce71e27c6fdc43c29f36d307b9100bde70c947fc Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Tue, 19 Aug 2008 20:43:25 +0300
Subject: SLUB: Replace __builtin_return_address(0) with _RET_IP_.

This patch replaces __builtin_return_address(0) with _RET_IP_, since a
previous patch moved _RET_IP_ and _THIS_IP_ to include/linux/kernel.h and
they're widely available now. This makes for shorter and easier to read
code.

[penberg@cs.helsinki.fi: remove _RET_IP_ casts to void pointer]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/slab.h |  8 ++++----
 mm/slab.c            |  8 ++++----
 mm/slub.c            | 48 ++++++++++++++++++++++++------------------------
 3 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 000da12b5cf..c97ed28559e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -253,9 +253,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
  * request comes from.
  */
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
-extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
+extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
 #define kmalloc_track_caller(size, flags) \
-	__kmalloc_track_caller(size, flags, __builtin_return_address(0))
+	__kmalloc_track_caller(size, flags, _RET_IP_)
 #else
 #define kmalloc_track_caller(size, flags) \
 	__kmalloc(size, flags)
@@ -271,10 +271,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
  * allocation request comes from.
  */
 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
-extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
+extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
 #define kmalloc_node_track_caller(size, flags, node) \
 	__kmalloc_node_track_caller(size, flags, node, \
-			__builtin_return_address(0))
+			_RET_IP_)
 #else
 #define kmalloc_node_track_caller(size, flags, node) \
 	__kmalloc_node(size, flags, node)
diff --git a/mm/slab.c b/mm/slab.c
index 09187517f9d..a1478779901 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3686,9 +3686,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 EXPORT_SYMBOL(__kmalloc_node);
 
 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
-		int node, void *caller)
+		int node, unsigned long caller)
 {
-	return __do_kmalloc_node(size, flags, node, caller);
+	return __do_kmalloc_node(size, flags, node, (void *)caller);
 }
 EXPORT_SYMBOL(__kmalloc_node_track_caller);
 #else
@@ -3730,9 +3730,9 @@ void *__kmalloc(size_t size, gfp_t flags)
 }
 EXPORT_SYMBOL(__kmalloc);
 
-void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
+void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
 {
-	return __do_kmalloc(size, flags, caller);
+	return __do_kmalloc(size, flags, (void *)caller);
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
 
diff --git a/mm/slub.c b/mm/slub.c
index 7ec2888bffc..68ab260583d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -182,7 +182,7 @@ static LIST_HEAD(slab_caches);
  * Tracking user of a slab.
  */
 struct track {
-	void *addr;		/* Called from address */
+	unsigned long addr;	/* Called from address */
 	int cpu;		/* Was running on cpu */
 	int pid;		/* Pid context */
 	unsigned long when;	/* When did the operation occur */
@@ -371,7 +371,7 @@ static struct track *get_track(struct kmem_cache *s, void *object,
 }
 
 static void set_track(struct kmem_cache *s, void *object,
-				enum track_item alloc, void *addr)
+			enum track_item alloc, unsigned long addr)
 {
 	struct track *p;
 
@@ -395,8 +395,8 @@ static void init_tracking(struct kmem_cache *s, void *object)
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
-	set_track(s, object, TRACK_FREE, NULL);
-	set_track(s, object, TRACK_ALLOC, NULL);
+	set_track(s, object, TRACK_FREE, 0UL);
+	set_track(s, object, TRACK_ALLOC, 0UL);
 }
 
 static void print_track(const char *s, struct track *t)
@@ -405,7 +405,7 @@ static void print_track(const char *s, struct track *t)
 		return;
 
 	printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
-		s, t->addr, jiffies - t->when, t->cpu, t->pid);
+		s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
 }
 
 static void print_tracking(struct kmem_cache *s, void *object)
@@ -870,7 +870,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
 }
 
 static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
-						void *object, void *addr)
+					void *object, unsigned long addr)
 {
 	if (!check_slab(s, page))
 		goto bad;
@@ -910,7 +910,7 @@ bad:
 }
 
 static int free_debug_processing(struct kmem_cache *s, struct page *page,
-						void *object, void *addr)
+					void *object, unsigned long addr)
 {
 	if (!check_slab(s, page))
 		goto fail;
@@ -1033,10 +1033,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
 			struct page *page, void *object) {}
 
 static inline int alloc_debug_processing(struct kmem_cache *s,
-	struct page *page, void *object, void *addr) { return 0; }
+	struct page *page, void *object, unsigned long addr) { return 0; }
 
 static inline int free_debug_processing(struct kmem_cache *s,
-	struct page *page, void *object, void *addr) { return 0; }
+	struct page *page, void *object, unsigned long addr) { return 0; }
 
 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
 			{ return 1; }
@@ -1503,8 +1503,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node)
  * we need to allocate a new slab. This is the slowest path since it involves
  * a call to the page allocator and the setup of a new slab.
  */
-static void *__slab_alloc(struct kmem_cache *s,
-		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+			  unsigned long addr, struct kmem_cache_cpu *c)
 {
 	void **object;
 	struct page *new;
@@ -1588,7 +1588,7 @@ debug:
  * Otherwise we can simply pick the next object from the lockless free list.
  */
 static __always_inline void *slab_alloc(struct kmem_cache *s,
-		gfp_t gfpflags, int node, void *addr)
+		gfp_t gfpflags, int node, unsigned long addr)
 {
 	void **object;
 	struct kmem_cache_cpu *c;
@@ -1617,14 +1617,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 {
-	return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
+	return slab_alloc(s, gfpflags, -1, _RET_IP_);
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
-	return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
+	return slab_alloc(s, gfpflags, node, _RET_IP_);
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 #endif
@@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
  * handling required then we can return immediately.
  */
 static void __slab_free(struct kmem_cache *s, struct page *page,
-				void *x, void *addr, unsigned int offset)
+			void *x, unsigned long addr, unsigned int offset)
 {
 	void *prior;
 	void **object = (void *)x;
@@ -1708,7 +1708,7 @@ debug:
  * with all sorts of special processing.
  */
 static __always_inline void slab_free(struct kmem_cache *s,
-			struct page *page, void *x, void *addr)
+			struct page *page, void *x, unsigned long addr)
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
@@ -1735,7 +1735,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 
 	page = virt_to_head_page(x);
 
-	slab_free(s, page, x, __builtin_return_address(0));
+	slab_free(s, page, x, _RET_IP_);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
@@ -2663,7 +2663,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	return slab_alloc(s, flags, -1, __builtin_return_address(0));
+	return slab_alloc(s, flags, -1, _RET_IP_);
 }
 EXPORT_SYMBOL(__kmalloc);
 
@@ -2691,7 +2691,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	if (unlikely(ZERO_OR_NULL_PTR(s)))
 		return s;
 
-	return slab_alloc(s, flags, node, __builtin_return_address(0));
+	return slab_alloc(s, flags, node, _RET_IP_);
 }
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
@@ -2748,7 +2748,7 @@ void kfree(const void *x)
 		put_page(page);
 		return;
 	}
-	slab_free(page->slab, page, object, __builtin_return_address(0));
+	slab_free(page->slab, page, object, _RET_IP_);
 }
 EXPORT_SYMBOL(kfree);
 
@@ -3204,7 +3204,7 @@ static struct notifier_block __cpuinitdata slab_notifier = {
 
 #endif
 
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 {
 	struct kmem_cache *s;
 
@@ -3220,7 +3220,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 }
 
 void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
-					int node, void *caller)
+					int node, unsigned long caller)
 {
 	struct kmem_cache *s;
 
@@ -3431,7 +3431,7 @@ static void resiliency_test(void) {};
 
 struct location {
 	unsigned long count;
-	void *addr;
+	unsigned long addr;
 	long long sum_time;
 	long min_time;
 	long max_time;
@@ -3479,7 +3479,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 {
 	long start, end, pos;
 	struct location *l;
-	void *caddr;
+	unsigned long caddr;
 	unsigned long age = jiffies - track->when;
 
 	start = -1;
-- 
cgit v1.2.3-70-g09d2


From e2f367f269fe19375f10e63efe0f2a6d3ddef8e6 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Fri, 21 Nov 2008 19:01:30 +0200
Subject: nl80211: Report max TX power in NL80211_BAND_ATTR_FREQS

This is useful information to provide for userspace (e.g., hostapd needs
this to generate Country IE).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 4 ++++
 net/wireless/nl80211.c  | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 79827345351..54d6ebe38e3 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -508,6 +508,7 @@ enum nl80211_band_attr {
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory
  *	on this channel in current regulatory domain.
+ * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in dBm.
  */
 enum nl80211_frequency_attr {
 	__NL80211_FREQUENCY_ATTR_INVALID,
@@ -516,12 +517,15 @@ enum nl80211_frequency_attr {
 	NL80211_FREQUENCY_ATTR_PASSIVE_SCAN,
 	NL80211_FREQUENCY_ATTR_NO_IBSS,
 	NL80211_FREQUENCY_ATTR_RADAR,
+	NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
 
 	/* keep last */
 	__NL80211_FREQUENCY_ATTR_AFTER_LAST,
 	NL80211_FREQUENCY_ATTR_MAX = __NL80211_FREQUENCY_ATTR_AFTER_LAST - 1
 };
 
+#define NL80211_FREQUENCY_ATTR_MAX_TX_POWER NL80211_FREQUENCY_ATTR_MAX_TX_POWER
+
 /**
  * enum nl80211_bitrate_attr - bitrate attributes
  * @NL80211_BITRATE_ATTR_RATE: Bitrate in units of 100 kbps
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 00121ceddb1..2e8464eaaaa 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -198,6 +198,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 			if (chan->flags & IEEE80211_CHAN_RADAR)
 				NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_RADAR);
 
+			NLA_PUT_U8(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
+				   chan->max_power);
+
 			nla_nest_end(msg, nl_freq);
 		}
 
-- 
cgit v1.2.3-70-g09d2


From f80b5e99c7dac5a9a0d72496cec5075a12cd1476 Mon Sep 17 00:00:00 2001
From: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Date: Fri, 21 Nov 2008 20:40:09 -0200
Subject: rfkill: preserve state across suspend

The rfkill class API requires that the driver connected to a class
call rfkill_force_state() on resume to update the real state of the
rfkill controller, OR that it provides a get_state() hook.

This means there is potentially a hidden call in the resume code flow
that changes rfkill->state (i.e. rfkill_force_state()), so the
previous state of the transmitter was being lost.

The simplest and most future-proof way to fix this is to explicitly
store the pre-sleep state on the rfkill structure, and restore from
that on resume.

Signed-off-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/rfkill.h | 1 +
 net/rfkill/rfkill.c    | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index 4cd64b0d982..f376a93927f 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -108,6 +108,7 @@ struct rfkill {
 
 	struct device dev;
 	struct list_head node;
+	enum rfkill_state state_for_resume;
 };
 #define to_rfkill(d)	container_of(d, struct rfkill, dev)
 
diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index ec26eae8004..5ad411d3e8f 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -565,10 +565,15 @@ static void rfkill_release(struct device *dev)
 #ifdef CONFIG_PM
 static int rfkill_suspend(struct device *dev, pm_message_t state)
 {
+	struct rfkill *rfkill = to_rfkill(dev);
+
 	/* mark class device as suspended */
 	if (dev->power.power_state.event != state.event)
 		dev->power.power_state = state;
 
+	/* store state for the resume handler */
+	rfkill->state_for_resume = rfkill->state;
+
 	return 0;
 }
 
@@ -590,7 +595,7 @@ static int rfkill_resume(struct device *dev)
 		rfkill_toggle_radio(rfkill,
 				rfkill_epo_lock_active ?
 					RFKILL_STATE_SOFT_BLOCKED :
-					rfkill->state,
+					rfkill->state_for_resume,
 				1);
 
 		mutex_unlock(&rfkill->mutex);
-- 
cgit v1.2.3-70-g09d2


From bf8c1ac6d81ba8c0e4dc2215f84f5e2a3c8227e8 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Sat, 22 Nov 2008 22:00:31 +0200
Subject: nl80211: Change max TX power to be in mBm instead of dBm

In order to be consistent with NL80211_ATTR_POWER_RULE_MAX_EIRP,
change NL80211_FREQUENCY_ATTR_MAX_TX_POWER to use mBm and U32 instead
of dBm and U8. This is a userspace interface change, but the previous
version had not yet been pushed upstream and there are no userspace
programs using this yet, so there is justification to get this change in
as long as it goes in before the previous version gets out.

Signed-off-by: Jouni Malinen <j@w1.fi>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 3 ++-
 net/wireless/nl80211.c  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 54d6ebe38e3..e08c8bcfb78 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -508,7 +508,8 @@ enum nl80211_band_attr {
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory
  *	on this channel in current regulatory domain.
- * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in dBm.
+ * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in mBm
+ *	(100 * dBm).
  */
 enum nl80211_frequency_attr {
 	__NL80211_FREQUENCY_ATTR_INVALID,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 2e8464eaaaa..c9141e3df9b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -198,8 +198,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
 			if (chan->flags & IEEE80211_CHAN_RADAR)
 				NLA_PUT_FLAG(msg, NL80211_FREQUENCY_ATTR_RADAR);
 
-			NLA_PUT_U8(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
-				   chan->max_power);
+			NLA_PUT_U32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
+				    DBM_TO_MBM(chan->max_power));
 
 			nla_nest_end(msg, nl_freq);
 		}
-- 
cgit v1.2.3-70-g09d2


From d211af055d0c12dc3416c2886e6fbdc6eb74a381 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Mon, 24 Nov 2008 15:38:45 +0100
Subject: i386: get rid of the use of KPROBE_ENTRY / KPROBE_END

entry_32.S is now the only user of KPROBE_ENTRY / KPROBE_END,
treewide. This patch reorders entry_64.S and explicitly generates
a separate section for functions that need the protection. The
generated code before and after the patch is equal.

The KPROBE_ENTRY and KPROBE_END macro's are removed too.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S | 438 +++++++++++++++++++++++----------------------
 include/linux/linkage.h    |   8 -
 2 files changed, 224 insertions(+), 222 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index bd02ec77edc..6e96028d1a9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -688,65 +688,6 @@ ENDPROC(name)
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
-KPROBE_ENTRY(page_fault)
-	RING0_EC_FRAME
-	pushl $do_page_fault
-	CFI_ADJUST_CFA_OFFSET 4
-	ALIGN
-error_code:
-	/* the function address is in %fs's slot on the stack */
-	pushl %es
-	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET es, 0*/
-	pushl %ds
-	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET ds, 0*/
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET eax, 0
-	pushl %ebp
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ebp, 0
-	pushl %edi
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET edi, 0
-	pushl %esi
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET esi, 0
-	pushl %edx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET edx, 0
-	pushl %ecx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ecx, 0
-	pushl %ebx
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET ebx, 0
-	cld
-	pushl %fs
-	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET fs, 0*/
-	movl $(__KERNEL_PERCPU), %ecx
-	movl %ecx, %fs
-	UNWIND_ESPFIX_STACK
-	popl %ecx
-	CFI_ADJUST_CFA_OFFSET -4
-	/*CFI_REGISTER es, ecx*/
-	movl PT_FS(%esp), %edi		# get the function address
-	movl PT_ORIG_EAX(%esp), %edx	# get the error code
-	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
-	mov  %ecx, PT_FS(%esp)
-	/*CFI_REL_OFFSET fs, ES*/
-	movl $(__USER_DS), %ecx
-	movl %ecx, %ds
-	movl %ecx, %es
-	TRACE_IRQS_OFF
-	movl %esp,%eax			# pt_regs pointer
-	call *%edi
-	jmp ret_from_exception
-	CFI_ENDPROC
-KPROBE_END(page_fault)
-
 ENTRY(coprocessor_error)
 	RING0_INT_FRAME
 	pushl $0
@@ -777,140 +718,6 @@ ENTRY(device_not_available)
 	CFI_ENDPROC
 END(device_not_available)
 
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-#define FIX_STACK(offset, ok, label)		\
-	cmpw $__KERNEL_CS,4(%esp);		\
-	jne ok;					\
-label:						\
-	movl TSS_sysenter_sp0+offset(%esp),%esp;	\
-	CFI_DEF_CFA esp, 0;			\
-	CFI_UNDEFINED eip;			\
-	pushfl;					\
-	CFI_ADJUST_CFA_OFFSET 4;		\
-	pushl $__KERNEL_CS;			\
-	CFI_ADJUST_CFA_OFFSET 4;		\
-	pushl $sysenter_past_esp;		\
-	CFI_ADJUST_CFA_OFFSET 4;		\
-	CFI_REL_OFFSET eip, 0
-
-KPROBE_ENTRY(debug)
-	RING0_INT_FRAME
-	cmpl $ia32_sysenter_target,(%esp)
-	jne debug_stack_correct
-	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
-debug_stack_correct:
-	pushl $-1			# mark this as an int
-	CFI_ADJUST_CFA_OFFSET 4
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	xorl %edx,%edx			# error code 0
-	movl %esp,%eax			# pt_regs pointer
-	call do_debug
-	jmp ret_from_exception
-	CFI_ENDPROC
-KPROBE_END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-KPROBE_ENTRY(nmi)
-	RING0_INT_FRAME
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
-	movl %ss, %eax
-	cmpw $__ESPFIX_SS, %ax
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
-	je nmi_espfix_stack
-	cmpl $ia32_sysenter_target,(%esp)
-	je nmi_stack_fixup
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
-	movl %esp,%eax
-	/* Do not access memory above the end of our stack page,
-	 * it might not exist.
-	 */
-	andl $(THREAD_SIZE-1),%eax
-	cmpl $(THREAD_SIZE-20),%eax
-	popl %eax
-	CFI_ADJUST_CFA_OFFSET -4
-	jae nmi_stack_correct
-	cmpl $ia32_sysenter_target,12(%esp)
-	je nmi_debug_stack_check
-nmi_stack_correct:
-	/* We have a RING0_INT_FRAME here */
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_nmi
-	jmp restore_nocheck_notrace
-	CFI_ENDPROC
-
-nmi_stack_fixup:
-	RING0_INT_FRAME
-	FIX_STACK(12,nmi_stack_correct, 1)
-	jmp nmi_stack_correct
-
-nmi_debug_stack_check:
-	/* We have a RING0_INT_FRAME here */
-	cmpw $__KERNEL_CS,16(%esp)
-	jne nmi_stack_correct
-	cmpl $debug,(%esp)
-	jb nmi_stack_correct
-	cmpl $debug_esp_fix_insn,(%esp)
-	ja nmi_stack_correct
-	FIX_STACK(24,nmi_stack_correct, 1)
-	jmp nmi_stack_correct
-
-nmi_espfix_stack:
-	/* We have a RING0_INT_FRAME here.
-	 *
-	 * create the pointer to lss back
-	 */
-	pushl %ss
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl %esp
-	CFI_ADJUST_CFA_OFFSET 4
-	addw $4, (%esp)
-	/* copy the iret frame of 12 bytes */
-	.rept 3
-	pushl 16(%esp)
-	CFI_ADJUST_CFA_OFFSET 4
-	.endr
-	pushl %eax
-	CFI_ADJUST_CFA_OFFSET 4
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	FIXUP_ESPFIX_STACK		# %eax == %esp
-	xorl %edx,%edx			# zero error code
-	call do_nmi
-	RESTORE_REGS
-	lss 12+4(%esp), %esp		# back to espfix stack
-	CFI_ADJUST_CFA_OFFSET -24
-	jmp irq_return
-	CFI_ENDPROC
-KPROBE_END(nmi)
-
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iret
@@ -926,19 +733,6 @@ ENTRY(native_irq_enable_sysexit)
 END(native_irq_enable_sysexit)
 #endif
 
-KPROBE_ENTRY(int3)
-	RING0_INT_FRAME
-	pushl $-1			# mark this as an int
-	CFI_ADJUST_CFA_OFFSET 4
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_int3
-	jmp ret_from_exception
-	CFI_ENDPROC
-KPROBE_END(int3)
-
 ENTRY(overflow)
 	RING0_INT_FRAME
 	pushl $0
@@ -1003,14 +797,6 @@ ENTRY(stack_segment)
 	CFI_ENDPROC
 END(stack_segment)
 
-KPROBE_ENTRY(general_protection)
-	RING0_EC_FRAME
-	pushl $do_general_protection
-	CFI_ADJUST_CFA_OFFSET 4
-	jmp error_code
-	CFI_ENDPROC
-KPROBE_END(general_protection)
-
 ENTRY(alignment_check)
 	RING0_EC_FRAME
 	pushl $do_alignment_check
@@ -1220,3 +1006,227 @@ END(mcount)
 #include "syscall_table_32.S"
 
 syscall_table_size=(.-sys_call_table)
+
+/*
+ * Some functions should be protected against kprobes
+ */
+	.pushsection .kprobes.text, "ax"
+
+ENTRY(page_fault)
+	RING0_EC_FRAME
+	pushl $do_page_fault
+	CFI_ADJUST_CFA_OFFSET 4
+	ALIGN
+error_code:
+	/* the function address is in %fs's slot on the stack */
+	pushl %es
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET es, 0*/
+	pushl %ds
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET ds, 0*/
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET eax, 0
+	pushl %ebp
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebp, 0
+	pushl %edi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edi, 0
+	pushl %esi
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET esi, 0
+	pushl %edx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET edx, 0
+	pushl %ecx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ecx, 0
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET ebx, 0
+	cld
+	pushl %fs
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET fs, 0*/
+	movl $(__KERNEL_PERCPU), %ecx
+	movl %ecx, %fs
+	UNWIND_ESPFIX_STACK
+	popl %ecx
+	CFI_ADJUST_CFA_OFFSET -4
+	/*CFI_REGISTER es, ecx*/
+	movl PT_FS(%esp), %edi		# get the function address
+	movl PT_ORIG_EAX(%esp), %edx	# get the error code
+	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
+	mov  %ecx, PT_FS(%esp)
+	/*CFI_REL_OFFSET fs, ES*/
+	movl $(__USER_DS), %ecx
+	movl %ecx, %ds
+	movl %ecx, %es
+	TRACE_IRQS_OFF
+	movl %esp,%eax			# pt_regs pointer
+	call *%edi
+	jmp ret_from_exception
+	CFI_ENDPROC
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label)		\
+	cmpw $__KERNEL_CS,4(%esp);		\
+	jne ok;					\
+label:						\
+	movl TSS_sysenter_sp0+offset(%esp),%esp;	\
+	CFI_DEF_CFA esp, 0;			\
+	CFI_UNDEFINED eip;			\
+	pushfl;					\
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	pushl $__KERNEL_CS;			\
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	pushl $sysenter_past_esp;		\
+	CFI_ADJUST_CFA_OFFSET 4;		\
+	CFI_REL_OFFSET eip, 0
+
+ENTRY(debug)
+	RING0_INT_FRAME
+	cmpl $ia32_sysenter_target,(%esp)
+	jne debug_stack_correct
+	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+	pushl $-1			# mark this as an int
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	xorl %edx,%edx			# error code 0
+	movl %esp,%eax			# pt_regs pointer
+	call do_debug
+	jmp ret_from_exception
+	CFI_ENDPROC
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got  an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+	RING0_INT_FRAME
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl %ss, %eax
+	cmpw $__ESPFIX_SS, %ax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	je nmi_espfix_stack
+	cmpl $ia32_sysenter_target,(%esp)
+	je nmi_stack_fixup
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	movl %esp,%eax
+	/* Do not access memory above the end of our stack page,
+	 * it might not exist.
+	 */
+	andl $(THREAD_SIZE-1),%eax
+	cmpl $(THREAD_SIZE-20),%eax
+	popl %eax
+	CFI_ADJUST_CFA_OFFSET -4
+	jae nmi_stack_correct
+	cmpl $ia32_sysenter_target,12(%esp)
+	je nmi_debug_stack_check
+nmi_stack_correct:
+	/* We have a RING0_INT_FRAME here */
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	xorl %edx,%edx		# zero error code
+	movl %esp,%eax		# pt_regs pointer
+	call do_nmi
+	jmp restore_nocheck_notrace
+	CFI_ENDPROC
+
+nmi_stack_fixup:
+	RING0_INT_FRAME
+	FIX_STACK(12,nmi_stack_correct, 1)
+	jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+	/* We have a RING0_INT_FRAME here */
+	cmpw $__KERNEL_CS,16(%esp)
+	jne nmi_stack_correct
+	cmpl $debug,(%esp)
+	jb nmi_stack_correct
+	cmpl $debug_esp_fix_insn,(%esp)
+	ja nmi_stack_correct
+	FIX_STACK(24,nmi_stack_correct, 1)
+	jmp nmi_stack_correct
+
+nmi_espfix_stack:
+	/* We have a RING0_INT_FRAME here.
+	 *
+	 * create the pointer to lss back
+	 */
+	pushl %ss
+	CFI_ADJUST_CFA_OFFSET 4
+	pushl %esp
+	CFI_ADJUST_CFA_OFFSET 4
+	addw $4, (%esp)
+	/* copy the iret frame of 12 bytes */
+	.rept 3
+	pushl 16(%esp)
+	CFI_ADJUST_CFA_OFFSET 4
+	.endr
+	pushl %eax
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	FIXUP_ESPFIX_STACK		# %eax == %esp
+	xorl %edx,%edx			# zero error code
+	call do_nmi
+	RESTORE_REGS
+	lss 12+4(%esp), %esp		# back to espfix stack
+	CFI_ADJUST_CFA_OFFSET -24
+	jmp irq_return
+	CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+	RING0_INT_FRAME
+	pushl $-1			# mark this as an int
+	CFI_ADJUST_CFA_OFFSET 4
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	xorl %edx,%edx		# zero error code
+	movl %esp,%eax		# pt_regs pointer
+	call do_int3
+	jmp ret_from_exception
+	CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+	RING0_EC_FRAME
+	pushl $do_general_protection
+	CFI_ADJUST_CFA_OFFSET 4
+	jmp error_code
+	CFI_ENDPROC
+END(general_protection)
+
+/*
+ * End of kprobes section
+ */
+	.popsection
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 9fd1f859021..fee9e59649c 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -64,14 +64,6 @@
 	name:
 #endif
 
-#define KPROBE_ENTRY(name) \
-  .pushsection .kprobes.text, "ax"; \
-  ENTRY(name)
-
-#define KPROBE_END(name) \
-  END(name);		 \
-  .popsection
-
 #ifndef END
 #define END(name) \
   .size name, .-name
-- 
cgit v1.2.3-70-g09d2


From 8b752e3ef6e3f5cde87afc649dd51d92b1e549c1 Mon Sep 17 00:00:00 2001
From: Liming Wang <liming.wang@windriver.com>
Date: Fri, 28 Nov 2008 09:52:40 +0800
Subject: softirq: remove useless function __local_bh_enable

Impact: remove unused code

__local_bh_enable has been replaced with _local_bh_enable.
As comments says "it always nests inside local_bh_enable() sections"
has not been valid now. Also there is no reason to use __local_bh_enable
anywhere, so we can remove this useless function.

Signed-off-by: Liming Wang <liming.wang@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/bottom_half.h |  1 -
 kernel/softirq.c            | 14 --------------
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 777dbf695d4..27b1bcffe40 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
 #define _LINUX_BH_H
 
 extern void local_bh_disable(void);
-extern void __local_bh_enable(void);
 extern void _local_bh_enable(void);
 extern void local_bh_enable(void);
 extern void local_bh_enable_ip(unsigned long ip);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d6..8d9934b4162 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
 
 EXPORT_SYMBOL(local_bh_disable);
 
-void __local_bh_enable(void)
-{
-	WARN_ON_ONCE(in_irq());
-
-	/*
-	 * softirqs should never be enabled by __local_bh_enable(),
-	 * it always nests inside local_bh_enable() sections:
-	 */
-	WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
-
-	sub_preempt_count(SOFTIRQ_OFFSET);
-}
-EXPORT_SYMBOL_GPL(__local_bh_enable);
-
 /*
  * Special-case - softirqs can safely be enabled in
  * cond_resched_softirq(), or by __do_softirq(),
-- 
cgit v1.2.3-70-g09d2


From a838c2ec6ea1f18431da74dfe4978c57355b95f3 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 27 Nov 2008 16:14:44 +0800
Subject: markers: comment marker_synchronize_unregister() on data dependency

Add document and comments on marker_synchronize_unregister(): it
should be called before freeing resources that the probes depend on.

Based on comments from Lai Jiangshan and Mathieu Desnoyers.

Signed-off-by: Wu Fengguang <wfg@linux.intel.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/markers.txt | 15 ++++++++++-----
 include/linux/marker.h    |  6 ++++--
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/markers.txt b/Documentation/markers.txt
index 6d275e4ef38..d2b3d0e91b2 100644
--- a/Documentation/markers.txt
+++ b/Documentation/markers.txt
@@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be
 activated by calling marker_arm(). Marker deactivation can be done by calling
 marker_disarm() as many times as marker_arm() has been called. Removing a probe
 is done through marker_probe_unregister(); it will disarm the probe.
-marker_synchronize_unregister() must be called before the end of the module exit
-function to make sure there is no caller left using the probe. This, and the
-fact that preemption is disabled around the probe call, make sure that probe
-removal and module unload are safe. See the "Probe example" section below for a
-sample probe module.
+
+marker_synchronize_unregister() must be called between probe unregistration and
+the first occurrence of
+- the end of module exit function,
+  to make sure there is no caller left using the probe;
+- the free of any resource used by the probes,
+  to make sure the probes wont be accessing invalid data.
+This, and the fact that preemption is disabled around the probe call, make sure
+that probe removal and module unload are safe. See the "Probe example" section
+below for a sample probe module.
 
 The marker mechanism supports inserting multiple instances of the same marker.
 Markers can be put in inline functions, inlined static functions, and
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 34c14bc957f..b85e74ca782 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -211,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
 
 /*
  * marker_synchronize_unregister must be called between the last marker probe
- * unregistration and the end of module exit to make sure there is no caller
- * executing a probe when it is freed.
+ * unregistration and the first one of
+ * - the end of module exit function
+ * - the free of any resource used by the probes
+ * to ensure the code and data are valid for any possibly running probes.
  */
 #define marker_synchronize_unregister() synchronize_sched()
 
-- 
cgit v1.2.3-70-g09d2


From 0f0ca340e57bd7446855fefd07a64249acf81223 Mon Sep 17 00:00:00 2001
From: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Date: Fri, 28 Nov 2008 16:24:56 -0800
Subject: phy: power management support

This patch adds the power management support into the physical
abstraction layer.

Suspend and resume functions respectively turns on/off the bit 11
into the PHY Basic mode control register.
Generic PHY device starts supporting PM.

In order to support the wake-on LAN and avoid to put in power down
the PHY device, the MDIO is aware of what the Ethernet device wants to do.

Voluntary, no CONFIG_PM defines were added into the sources.
Also generic suspend/resume functions are exported to allow
other drivers use them (such as genphy_config_aneg etc.).

Within the phy_driver_register function, we need to remove the
memset. It overrides the device driver owner and it is not good.

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio_bus.c   | 14 ++++++++++----
 drivers/net/phy/phy_device.c | 31 ++++++++++++++++++++++++++++++-
 include/linux/phy.h          |  2 ++
 3 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 868812ff6de..8755d8cd416 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -284,9 +284,12 @@ static int mdio_bus_suspend(struct device * dev, pm_message_t state)
 {
 	int ret = 0;
 	struct device_driver *drv = dev->driver;
+	struct phy_driver *phydrv = to_phy_driver(drv);
+	struct phy_device *phydev = to_phy_device(dev);
 
-	if (drv && drv->suspend)
-		ret = drv->suspend(dev, state);
+	if ((!device_may_wakeup(phydev->dev.parent)) &&
+		(phydrv && phydrv->suspend))
+			ret = phydrv->suspend(phydev);
 
 	return ret;
 }
@@ -295,9 +298,12 @@ static int mdio_bus_resume(struct device * dev)
 {
 	int ret = 0;
 	struct device_driver *drv = dev->driver;
+	struct phy_driver *phydrv = to_phy_driver(drv);
+	struct phy_device *phydev = to_phy_device(dev);
 
-	if (drv && drv->resume)
-		ret = drv->resume(dev);
+	if ((!device_may_wakeup(phydev->dev.parent)) &&
+		(phydrv && phydrv->resume))
+		ret = phydrv->resume(phydev);
 
 	return ret;
 }
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 29546a20604..4cc75a290c0 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -779,7 +779,35 @@ static int genphy_config_init(struct phy_device *phydev)
 
 	return 0;
 }
+int genphy_suspend(struct phy_device *phydev)
+{
+	int value;
+
+	mutex_lock(&phydev->lock);
+
+	value = phy_read(phydev, MII_BMCR);
+	phy_write(phydev, MII_BMCR, (value | BMCR_PDOWN));
+
+	mutex_unlock(&phydev->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(genphy_suspend);
 
+int genphy_resume(struct phy_device *phydev)
+{
+	int value;
+
+	mutex_lock(&phydev->lock);
+
+	value = phy_read(phydev, MII_BMCR);
+	phy_write(phydev, MII_BMCR, (value & ~BMCR_PDOWN));
+
+	mutex_unlock(&phydev->lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(genphy_resume);
 
 /**
  * phy_probe - probe and init a PHY device
@@ -855,7 +883,6 @@ int phy_driver_register(struct phy_driver *new_driver)
 {
 	int retval;
 
-	memset(&new_driver->driver, 0, sizeof(new_driver->driver));
 	new_driver->driver.name = new_driver->name;
 	new_driver->driver.bus = &mdio_bus_type;
 	new_driver->driver.probe = phy_probe;
@@ -890,6 +917,8 @@ static struct phy_driver genphy_driver = {
 	.features	= 0,
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
+	.suspend	= genphy_suspend,
+	.resume		= genphy_resume,
 	.driver		= {.owner= THIS_MODULE, },
 };
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 77c4ed60b98..d7e54d98869 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -467,6 +467,8 @@ int genphy_restart_aneg(struct phy_device *phydev);
 int genphy_config_aneg(struct phy_device *phydev);
 int genphy_update_link(struct phy_device *phydev);
 int genphy_read_status(struct phy_device *phydev);
+int genphy_suspend(struct phy_device *phydev);
+int genphy_resume(struct phy_device *phydev);
 void phy_driver_unregister(struct phy_driver *drv);
 int phy_driver_register(struct phy_driver *new_driver);
 void phy_prepare_link(struct phy_device *phydev,
-- 
cgit v1.2.3-70-g09d2


From 6c415b9234a8c71f290e5d4fddc467f103f32719 Mon Sep 17 00:00:00 2001
From: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Date: Mon, 1 Dec 2008 20:49:05 +0530
Subject: sched: add uid information to sched_debug for CONFIG_USER_SCHED

Impact: extend information in /proc/sched_debug

This patch adds uid information in sched_debug for CONFIG_USER_SCHED

Signed-off-by: Arun R Bharadwaj <arun@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        | 10 ++++++++++
 kernel/sched_debug.c  |  6 +++++-
 kernel/user.c         |  2 ++
 4 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7a69c4d224e..d8733f07d80 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2218,6 +2218,7 @@ extern void normalize_rt_tasks(void);
 extern struct task_group init_task_group;
 #ifdef CONFIG_USER_SCHED
 extern struct task_group root_task_group;
+extern void set_tg_uid(struct user_struct *user);
 #endif
 
 extern struct task_group *sched_create_group(struct task_group *parent);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6a99703e0eb..4c7388ef5be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -261,6 +261,10 @@ struct task_group {
 	struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
 	struct sched_entity **se;
@@ -286,6 +290,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  * 	Every UID task group (including init_task_group aka UID-0) will
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index baf2f17af46..4293cfa9681 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -160,10 +160,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	cgroup_path(tg->css.cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+	{
+		uid_t uid = cfs_rq->tg->uid;
+		SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
+	}
 #else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
-
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae4..cec2224bc9f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up)
 	if (IS_ERR(up->tg))
 		rc = -ENOMEM;
 
+	set_tg_uid(up);
+
 	return rc;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 8789a9e7df6bf9b93739c4c7d4e380725bc9e936 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 15:34:07 -0500
Subject: ring-buffer: read page interface

Impact: new API to ring buffer

This patch adds a new interface into the ring buffer that allows a
page to be read from the ring buffer on a given CPU. For every page
read, one must also be given to allow for a "swap" of the pages.

 rpage = ring_buffer_alloc_read_page(buffer);
 if (!rpage)
	goto err;
 ret = ring_buffer_read_page(buffer, &rpage, cpu, full);
 if (!ret)
	goto empty;
 process_page(rpage);
 ring_buffer_free_read_page(rpage);

The caller of these functions must handle any waits that are
needed to wait for new data. The ring_buffer_read_page will simply
return 0 if there is no data, or if "full" is set and the writer
is still on the current page.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h |   5 ++
 kernel/trace/ring_buffer.c  | 166 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3bb87a753fa..1a350a847ed 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -124,6 +124,11 @@ void tracing_on(void);
 void tracing_off(void);
 void tracing_off_permanent(void);
 
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
+int ring_buffer_read_page(struct ring_buffer *buffer,
+			  void **data_page, int cpu, int full);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8619c534588..50b74d3a5c3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -687,6 +687,12 @@ static inline int rb_null_event(struct ring_buffer_event *event)
 	return event->type == RINGBUF_TYPE_PADDING;
 }
 
+static inline void *
+__rb_data_page_index(struct buffer_data_page *page, unsigned index)
+{
+	return page->data + index;
+}
+
 static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
 {
 	return page->page->data + index;
@@ -2232,6 +2238,166 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	return 0;
 }
 
+static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
+			      struct buffer_data_page *page)
+{
+	struct ring_buffer_event *event;
+	unsigned long head;
+
+	__raw_spin_lock(&cpu_buffer->lock);
+	for (head = 0; head < local_read(&page->commit);
+	     head += rb_event_length(event)) {
+
+		event = __rb_data_page_index(page, head);
+		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
+			return;
+		/* Only count data entries */
+		if (event->type != RINGBUF_TYPE_DATA)
+			continue;
+		cpu_buffer->entries--;
+	}
+	__raw_spin_unlock(&cpu_buffer->lock);
+}
+
+/**
+ * ring_buffer_alloc_read_page - allocate a page to read from buffer
+ * @buffer: the buffer to allocate for.
+ *
+ * This function is used in conjunction with ring_buffer_read_page.
+ * When reading a full page from the ring buffer, these functions
+ * can be used to speed up the process. The calling function should
+ * allocate a few pages first with this function. Then when it
+ * needs to get pages from the ring buffer, it passes the result
+ * of this function into ring_buffer_read_page, which will swap
+ * the page that was allocated, with the read page of the buffer.
+ *
+ * Returns:
+ *  The page allocated, or NULL on error.
+ */
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
+{
+	unsigned long addr;
+	struct buffer_data_page *page;
+
+	addr = __get_free_page(GFP_KERNEL);
+	if (!addr)
+		return NULL;
+
+	page = (void *)addr;
+
+	return page;
+}
+
+/**
+ * ring_buffer_free_read_page - free an allocated read page
+ * @buffer: the buffer the page was allocate for
+ * @data: the page to free
+ *
+ * Free a page allocated from ring_buffer_alloc_read_page.
+ */
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
+{
+	free_page((unsigned long)data);
+}
+
+/**
+ * ring_buffer_read_page - extract a page from the ring buffer
+ * @buffer: buffer to extract from
+ * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @cpu: the cpu of the buffer to extract
+ * @full: should the extraction only happen when the page is full.
+ *
+ * This function will pull out a page from the ring buffer and consume it.
+ * @data_page must be the address of the variable that was returned
+ * from ring_buffer_alloc_read_page. This is because the page might be used
+ * to swap with a page in the ring buffer.
+ *
+ * for example:
+ *	rpage = ring_buffer_alloc_page(buffer);
+ *	if (!rpage)
+ *		return error;
+ *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
+ *	if (ret)
+ *		process_page(rpage);
+ *
+ * When @full is set, the function will not return true unless
+ * the writer is off the reader page.
+ *
+ * Note: it is up to the calling functions to handle sleeps and wakeups.
+ *  The ring buffer can be used anywhere in the kernel and can not
+ *  blindly call wake_up. The layer that uses the ring buffer must be
+ *  responsible for that.
+ *
+ * Returns:
+ *  1 if data has been transferred
+ *  0 if no data has been transferred.
+ */
+int ring_buffer_read_page(struct ring_buffer *buffer,
+			    void **data_page, int cpu, int full)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	struct ring_buffer_event *event;
+	struct buffer_data_page *page;
+	unsigned long flags;
+	int ret = 0;
+
+	if (!data_page)
+		return 0;
+
+	page = *data_page;
+	if (!page)
+		return 0;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	/*
+	 * rb_buffer_peek will get the next ring buffer if
+	 * the current reader page is empty.
+	 */
+	event = rb_buffer_peek(buffer, cpu, NULL);
+	if (!event)
+		goto out;
+
+	/* check for data */
+	if (!local_read(&cpu_buffer->reader_page->page->commit))
+		goto out;
+	/*
+	 * If the writer is already off of the read page, then simply
+	 * switch the read page with the given page. Otherwise
+	 * we need to copy the data from the reader to the writer.
+	 */
+	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
+		unsigned int read = cpu_buffer->reader_page->read;
+
+		if (full)
+			goto out;
+		/* The writer is still on the reader page, we must copy */
+		page = cpu_buffer->reader_page->page;
+		memcpy(page->data,
+		       cpu_buffer->reader_page->page->data + read,
+		       local_read(&page->commit) - read);
+
+		/* consume what was read */
+		cpu_buffer->reader_page += read;
+
+	} else {
+		/* swap the pages */
+		rb_init_page(page);
+		page = cpu_buffer->reader_page->page;
+		cpu_buffer->reader_page->page = *data_page;
+		cpu_buffer->reader_page->read = 0;
+		*data_page = page;
+	}
+	ret = 1;
+
+	/* update the entry counter */
+	rb_remove_entries(cpu_buffer, page);
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return ret;
+}
+
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
-- 
cgit v1.2.3-70-g09d2


From 14a866c567e040ccf6240d68b083dd1dbbde63e6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 23:50:02 -0500
Subject: ftrace: add ftrace_graph_stop()

Impact: new ftrace_graph_stop function

While developing more features of function graph, I hit a bug that
caused the WARN_ON to trigger in the prepare_ftrace_return function.
Well, it was hard for me to find out that was happening because the
bug would not print, it would just cause a hard lockup or reboot.
The reason is that it is not safe to call printk from this function.

Looking further, I also found that it calls unregister_ftrace_graph,
which grabs a mutex and calls kstop machine. This would definitely
lock the box up if it were to trigger.

This patch adds a fast and safe ftrace_graph_stop() which will
stop the function tracer. Then it is safe to call the WARN ON.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c | 10 ++++++----
 include/linux/ftrace.h   |  2 ++
 kernel/trace/ftrace.c    |  5 +++++
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1a5b8f8cb3c..adba8e9a427 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -484,14 +484,16 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		: "memory"
 	);
 
-	if (WARN_ON(faulted)) {
-		unregister_ftrace_graph();
+	if (unlikely(faulted)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
 		return;
 	}
 
-	if (WARN_ON(!__kernel_text_address(old))) {
-		unregister_ftrace_graph();
+	if (unlikely(!__kernel_text_address(old))) {
+		ftrace_graph_stop();
 		*parent = old;
+		WARN_ON(1);
 		return;
 	}
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index afba918c623..58ca1c3a3f4 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -376,6 +376,8 @@ typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 				trace_func_graph_ent_t entryfunc);
 
+extern void ftrace_graph_stop(void);
+
 /* The current handlers in use */
 extern trace_func_graph_ret_t ftrace_graph_return;
 extern trace_func_graph_ent_t ftrace_graph_entry;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2e78628443e..a44af05ae2d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1769,5 +1769,10 @@ void ftrace_graph_exit_task(struct task_struct *t)
 
 	kfree(ret_stack);
 }
+
+void ftrace_graph_stop(void)
+{
+	ftrace_stop();
+}
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From e49dc19c6a19ea112fcb94b7c62ec62cdd5c08aa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Dec 2008 23:50:05 -0500
Subject: ftrace: function graph return for function entry

Impact: feature, let entry function decide to trace or not

This patch lets the graph tracer entry function decide if the tracing
should be done at the end as well. This requires all function graph
entry functions return 1 if it should trace, or 0 if the return should
not be traced.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_32.S |  3 +++
 arch/x86/kernel/entry_64.S |  3 +++
 arch/x86/kernel/ftrace.c   |  7 ++++++-
 include/linux/ftrace.h     |  2 +-
 kernel/trace/ftrace.c      | 10 +++++++---
 kernel/trace/trace.c       |  4 +++-
 kernel/trace/trace.h       |  2 +-
 7 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 826682abed1..43ceb3f454b 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1196,6 +1196,9 @@ ENTRY(mcount)
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	cmpl $ftrace_stub, ftrace_graph_return
 	jnz ftrace_graph_caller
+
+	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz ftrace_graph_caller
 #endif
 .globl ftrace_stub
 ftrace_stub:
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9060ba6497e..54e0bbdccb9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -120,6 +120,9 @@ ENTRY(mcount)
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	cmpq $ftrace_stub, ftrace_graph_return
 	jnz ftrace_graph_caller
+
+	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz ftrace_graph_caller
 #endif
 
 .globl ftrace_stub
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index adba8e9a427..d278ad2ebda 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -425,6 +425,7 @@ static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 	trace->calltime = current->ret_stack[index].calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = index;
+	barrier();
 	current->curr_ret_stack--;
 }
 
@@ -506,7 +507,11 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 	}
 
 	trace.func = self_addr;
-	ftrace_graph_entry(&trace);
 
+	/* Only trace if the calling function expects to */
+	if (!ftrace_graph_entry(&trace)) {
+		current->curr_ret_stack--;
+		*parent = old;
+	}
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 58ca1c3a3f4..469ceb3e85b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -371,7 +371,7 @@ struct ftrace_graph_ret {
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
 /* Type of the callback handlers for tracing function graph*/
 typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
-typedef void (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
 
 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 				trace_func_graph_ent_t entryfunc);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a44af05ae2d..65b9e863056 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1636,11 +1636,15 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 static atomic_t ftrace_graph_active;
 
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+	return 0;
+}
+
 /* The callbacks that hook a function */
 trace_func_graph_ret_t ftrace_graph_return =
 			(trace_func_graph_ret_t)ftrace_stub;
-trace_func_graph_ent_t ftrace_graph_entry =
-			(trace_func_graph_ent_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -1738,7 +1742,7 @@ void unregister_ftrace_graph(void)
 
 	atomic_dec(&ftrace_graph_active);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
-	ftrace_graph_entry = (trace_func_graph_ent_t)ftrace_stub;
+	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 
 	mutex_unlock(&ftrace_sysctl_lock);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 380de630ebc..8b6409a62b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1200,7 +1200,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-void trace_graph_entry(struct ftrace_graph_ent *trace)
+int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1219,6 +1219,8 @@ void trace_graph_entry(struct ftrace_graph_ent *trace)
 	}
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
+
+	return 1;
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f96f4e787ff..0565ae9a221 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -412,7 +412,7 @@ void trace_function(struct trace_array *tr,
 		    unsigned long flags, int pc);
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
-void trace_graph_entry(struct ftrace_graph_ent *trace);
+int trace_graph_entry(struct ftrace_graph_ent *trace);
 void trace_bts(struct trace_array *tr,
 	       unsigned long from,
 	       unsigned long to);
-- 
cgit v1.2.3-70-g09d2


From b908b53d580c3e9aba81ebe3339c5b7b4fa8031d Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Mon, 1 Dec 2008 06:30:04 +0000
Subject: of/gpio: Implement of_get_gpio_flags()

This adds a new function, of_get_gpio_flags, which is like
of_get_gpio(), but accepts a new "flags" argument.  This new function
will be used by the drivers that need to retrieve additional GPIO
information, such as active-low flag.

Also, this changes the default ("simple") .xlate routine to warn about
bogus (< 2) #gpio-cells usage: the second cell should always be present
for GPIO flags.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/of/gpio.c       | 36 +++++++++++++++++++++++++++++-------
 include/linux/of_gpio.h | 38 ++++++++++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c
index 7cd7301b583..a4ba217116e 100644
--- a/drivers/of/gpio.c
+++ b/drivers/of/gpio.c
@@ -19,14 +19,17 @@
 #include <asm/prom.h>
 
 /**
- * of_get_gpio - Get a GPIO number from the device tree to use with GPIO API
+ * of_get_gpio_flags - Get a GPIO number and flags to use with GPIO API
  * @np:		device node to get GPIO from
  * @index:	index of the GPIO
+ * @flags:	a flags pointer to fill in
  *
  * Returns GPIO number to use with Linux generic GPIO API, or one of the errno
- * value on the error condition.
+ * value on the error condition. If @flags is not NULL the function also fills
+ * in flags for the GPIO.
  */
-int of_get_gpio(struct device_node *np, int index)
+int of_get_gpio_flags(struct device_node *np, int index,
+		      enum of_gpio_flags *flags)
 {
 	int ret;
 	struct device_node *gc;
@@ -59,7 +62,11 @@ int of_get_gpio(struct device_node *np, int index)
 		goto err1;
 	}
 
-	ret = of_gc->xlate(of_gc, np, gpio_spec);
+	/* .xlate might decide to not fill in the flags, so clear it. */
+	if (flags)
+		*flags = 0;
+
+	ret = of_gc->xlate(of_gc, np, gpio_spec, flags);
 	if (ret < 0)
 		goto err1;
 
@@ -70,26 +77,41 @@ err0:
 	pr_debug("%s exited with status %d\n", __func__, ret);
 	return ret;
 }
-EXPORT_SYMBOL(of_get_gpio);
+EXPORT_SYMBOL(of_get_gpio_flags);
 
 /**
- * of_gpio_simple_xlate - translate gpio_spec to the GPIO number
+ * of_gpio_simple_xlate - translate gpio_spec to the GPIO number and flags
  * @of_gc:	pointer to the of_gpio_chip structure
  * @np:		device node of the GPIO chip
  * @gpio_spec:	gpio specifier as found in the device tree
+ * @flags:	a flags pointer to fill in
  *
  * This is simple translation function, suitable for the most 1:1 mapped
  * gpio chips. This function performs only one sanity check: whether gpio
  * is less than ngpios (that is specified in the gpio_chip).
  */
 int of_gpio_simple_xlate(struct of_gpio_chip *of_gc, struct device_node *np,
-			 const void *gpio_spec)
+			 const void *gpio_spec, enum of_gpio_flags *flags)
 {
 	const u32 *gpio = gpio_spec;
 
+	/*
+	 * We're discouraging gpio_cells < 2, since that way you'll have to
+	 * write your own xlate function (that will have to retrive the GPIO
+	 * number and the flags from a single gpio cell -- this is possible,
+	 * but not recommended).
+	 */
+	if (of_gc->gpio_cells < 2) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
 	if (*gpio > of_gc->gc.ngpio)
 		return -EINVAL;
 
+	if (flags)
+		*flags = gpio[1];
+
 	return *gpio;
 }
 EXPORT_SYMBOL(of_gpio_simple_xlate);
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index 67db101d0eb..e25abf610cb 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -14,9 +14,22 @@
 #ifndef __LINUX_OF_GPIO_H
 #define __LINUX_OF_GPIO_H
 
+#include <linux/compiler.h>
+#include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/gpio.h>
 
+struct device_node;
+
+/*
+ * This is Linux-specific flags. By default controllers' and Linux' mapping
+ * match, but GPIO controllers are free to translate their own flags to
+ * Linux-specific in their .xlate callback. Though, 1:1 mapping is recommended.
+ */
+enum of_gpio_flags {
+	OF_GPIO_ACTIVE_LOW = 0x1,
+};
+
 #ifdef CONFIG_OF_GPIO
 
 /*
@@ -26,7 +39,7 @@ struct of_gpio_chip {
 	struct gpio_chip gc;
 	int gpio_cells;
 	int (*xlate)(struct of_gpio_chip *of_gc, struct device_node *np,
-		     const void *gpio_spec);
+		     const void *gpio_spec, enum of_gpio_flags *flags);
 };
 
 static inline struct of_gpio_chip *to_of_gpio_chip(struct gpio_chip *gc)
@@ -50,20 +63,37 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
 	return container_of(of_gc, struct of_mm_gpio_chip, of_gc);
 }
 
-extern int of_get_gpio(struct device_node *np, int index);
+extern int of_get_gpio_flags(struct device_node *np, int index,
+			     enum of_gpio_flags *flags);
+
 extern int of_mm_gpiochip_add(struct device_node *np,
 			      struct of_mm_gpio_chip *mm_gc);
 extern int of_gpio_simple_xlate(struct of_gpio_chip *of_gc,
 				struct device_node *np,
-				const void *gpio_spec);
+				const void *gpio_spec,
+				enum of_gpio_flags *flags);
 #else
 
 /* Drivers may not strictly depend on the GPIO support, so let them link. */
-static inline int of_get_gpio(struct device_node *np, int index)
+static inline int of_get_gpio_flags(struct device_node *np, int index,
+				    enum of_gpio_flags *flags)
 {
 	return -ENOSYS;
 }
 
 #endif /* CONFIG_OF_GPIO */
 
+/**
+ * of_get_gpio - Get a GPIO number to use with GPIO API
+ * @np:		device node to get GPIO from
+ * @index:	index of the GPIO
+ *
+ * Returns GPIO number to use with Linux generic GPIO API, or one of the errno
+ * value on the error condition.
+ */
+static inline int of_get_gpio(struct device_node *np, int index)
+{
+	return of_get_gpio_flags(np, index, NULL);
+}
+
 #endif /* __LINUX_OF_GPIO_H */
-- 
cgit v1.2.3-70-g09d2


From 8865c418caf4e9dd2c24bdfae3a5a4106e143e60 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 3 Dec 2008 22:12:38 -0800
Subject: atm: 32-bit ioctl compatibility

We lack compat ioctl support through most of the ATM code. This patch
deals with most of it, and I can now at least use BR2684 and PPPoATM
with 32-bit userspace.

I haven't added a .compat_ioctl method to struct atm_ioctl, because
AFAICT none of the current users need any conversion -- so we can just
call the ->ioctl() method in every case. I looked at br2684, clip, lec,
mpc, pppoatm and atmtcp.

In svc_compat_ioctl() the only mangling which is needed is to change
COMPAT_ATM_ADDPARTY to ATM_ADDPARTY. Although it's defined as
	_IOW('a', ATMIOC_SPECIAL+4,struct atm_iobuf)
it doesn't actually _take_ a struct atm_iobuf as an argument -- it takes
a struct sockaddr_atmsvc, which _is_ the same between 32-bit and 64-bit
code, so doesn't need conversion.

Almost all of vcc_ioctl() would have been identical, so I converted that
into a core do_vcc_ioctl() function with an 'int compat' argument.

I've done the same with atm_dev_ioctl(), where there _are_ a few
differences, but still it's relatively contained and there would
otherwise have been a lot of duplication.

I haven't done any of the actual device-specific ioctls, although I've
added a compat_ioctl method to struct atmdev_ops.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atm.h    | 17 ++++++++--
 include/linux/atmdev.h | 15 +++++++++
 net/atm/common.h       |  1 +
 net/atm/ioctl.c        | 49 ++++++++++++++++++++++++----
 net/atm/pvc.c          |  3 ++
 net/atm/resources.c    | 88 ++++++++++++++++++++++++++++++++++++++------------
 net/atm/resources.h    |  2 +-
 net/atm/svc.c          | 19 +++++++++++
 8 files changed, 164 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/atm.h b/include/linux/atm.h
index c791ddd9693..d3b292174ae 100644
--- a/include/linux/atm.h
+++ b/include/linux/atm.h
@@ -231,10 +231,21 @@ static __inline__ int atmpvc_addr_in_use(struct sockaddr_atmpvc addr)
  */
 
 struct atmif_sioc {
-    int number;
-    int length;
-    void __user *arg;
+	int number;
+	int length;
+	void __user *arg;
 };
 
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+struct compat_atmif_sioc {
+	int number;
+	int length;
+	compat_uptr_t arg;
+};
+#endif
+#endif
+
 typedef unsigned short atm_backend_t;
 #endif
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index a3d07c29d16..086e5c362d3 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -100,6 +100,10 @@ struct atm_dev_stats {
 					/* use backend to make new if */
 #define ATM_ADDPARTY  	_IOW('a', ATMIOC_SPECIAL+4,struct atm_iobuf)
  					/* add party to p2mp call */
+#ifdef CONFIG_COMPAT
+/* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */
+#define COMPAT_ATM_ADDPARTY  	_IOW('a', ATMIOC_SPECIAL+4,struct compat_atm_iobuf)
+#endif
 #define ATM_DROPPARTY 	_IOW('a', ATMIOC_SPECIAL+5,int)
 					/* drop party from p2mp call */
 
@@ -224,6 +228,13 @@ struct atm_cirange {
 extern struct proc_dir_entry *atm_proc_root;
 #endif
 
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+struct compat_atm_iobuf {
+	int length;
+	compat_uptr_t buffer;
+};
+#endif
 
 struct k_atm_aal_stats {
 #define __HANDLE_ITEM(i) atomic_t i
@@ -379,6 +390,10 @@ struct atmdev_ops { /* only send is required */
 	int (*open)(struct atm_vcc *vcc);
 	void (*close)(struct atm_vcc *vcc);
 	int (*ioctl)(struct atm_dev *dev,unsigned int cmd,void __user *arg);
+#ifdef CONFIG_COMPAT
+	int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd,
+			    void __user *arg);
+#endif
 	int (*getsockopt)(struct atm_vcc *vcc,int level,int optname,
 	    void __user *optval,int optlen);
 	int (*setsockopt)(struct atm_vcc *vcc,int level,int optname,
diff --git a/net/atm/common.h b/net/atm/common.h
index 16f32c1fa1c..92e2981f479 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -19,6 +19,7 @@ int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
 		size_t total_len);
 unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
 int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int vcc_setsockopt(struct socket *sock, int level, int optname,
 		   char __user *optval, int optlen);
 int vcc_getsockopt(struct socket *sock, int level, int optname,
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 7afd8e7754f..76ed3c8d26e 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -19,6 +19,7 @@
 #include <linux/atmlec.h>
 #include <linux/mutex.h>
 #include <asm/ioctls.h>
+#include <net/compat.h>
 
 #include "resources.h"
 #include "signaling.h"		/* for WAITING and sigd_attach */
@@ -46,7 +47,7 @@ void deregister_atm_ioctl(struct atm_ioctl *ioctl)
 EXPORT_SYMBOL(register_atm_ioctl);
 EXPORT_SYMBOL(deregister_atm_ioctl);
 
-int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg, int compat)
 {
 	struct sock *sk = sock->sk;
 	struct atm_vcc *vcc;
@@ -80,13 +81,25 @@ int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 				goto done;
 			}
 		case SIOCGSTAMP: /* borrowed from IP */
-			error = sock_get_timestamp(sk, argp);
+#ifdef CONFIG_COMPAT
+			if (compat)
+				error = compat_sock_get_timestamp(sk, argp);
+			else
+#endif
+				error = sock_get_timestamp(sk, argp);
 			goto done;
 		case SIOCGSTAMPNS: /* borrowed from IP */
-			error = sock_get_timestampns(sk, argp);
+#ifdef CONFIG_COMPAT
+			if (compat)
+				error = compat_sock_get_timestampns(sk, argp);
+			else
+#endif
+				error = sock_get_timestampns(sk, argp);
 			goto done;
 		case ATM_SETSC:
-			printk(KERN_WARNING "ATM_SETSC is obsolete\n");
+			if (net_ratelimit())
+				printk(KERN_WARNING "ATM_SETSC is obsolete; used by %s:%d\n",
+				       current->comm, task_pid_nr(current));
 			error = 0;
 			goto done;
 		case ATMSIGD_CTRL:
@@ -99,12 +112,23 @@ int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			 * info uses kernel pointers as opaque references,
 			 * so the holder of the file descriptor can scribble
 			 * on the kernel... so we should make sure that we
-			 * have the same privledges that /proc/kcore needs
+			 * have the same privileges that /proc/kcore needs
 			 */
 			if (!capable(CAP_SYS_RAWIO)) {
 				error = -EPERM;
 				goto done;
 			}
+#ifdef CONFIG_COMPAT
+			/* WTF? I don't even want to _think_ about making this
+			   work for 32-bit userspace. TBH I don't really want
+			   to think about it at all. dwmw2. */
+			if (compat) {
+				if (net_ratelimit())
+					printk(KERN_WARNING "32-bit task cannot be atmsigd\n");
+				error = -EINVAL;
+				goto done;
+			}
+#endif
 			error = sigd_attach(vcc);
 			if (!error)
 				sock->state = SS_CONNECTED;
@@ -155,8 +179,21 @@ int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	if (error != -ENOIOCTLCMD)
 		goto done;
 
-	error = atm_dev_ioctl(cmd, argp);
+	error = atm_dev_ioctl(cmd, argp, compat);
 
 done:
 	return error;
 }
+
+
+int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	return do_vcc_ioctl(sock, cmd, arg, 0);
+}
+
+#ifdef CONFIG_COMPAT
+int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	return do_vcc_ioctl(sock, cmd, arg, 1);
+}
+#endif
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 43e8bf5ed00..e1d22d9430d 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -113,6 +113,9 @@ static const struct proto_ops pvc_proto_ops = {
 	.getname =	pvc_getname,
 	.poll =		vcc_poll,
 	.ioctl =	vcc_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = vcc_compat_ioctl,
+#endif
 	.listen =	sock_no_listen,
 	.shutdown =	pvc_shutdown,
 	.setsockopt =	pvc_setsockopt,
diff --git a/net/atm/resources.c b/net/atm/resources.c
index a34ba948af9..56b7322ff46 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -195,20 +195,39 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, in
 }
 
 
-int atm_dev_ioctl(unsigned int cmd, void __user *arg)
+int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
 {
 	void __user *buf;
 	int error, len, number, size = 0;
 	struct atm_dev *dev;
 	struct list_head *p;
 	int *tmp_buf, *tmp_p;
-	struct atm_iobuf __user *iobuf = arg;
-	struct atmif_sioc __user *sioc = arg;
+	int __user *sioc_len;
+	int __user *iobuf_len;
+
+#ifndef CONFIG_COMPAT
+	compat = 0; /* Just so the compiler _knows_ */
+#endif
+
 	switch (cmd) {
 		case ATM_GETNAMES:
-			if (get_user(buf, &iobuf->buffer))
-				return -EFAULT;
-			if (get_user(len, &iobuf->length))
+
+			if (compat) {
+#ifdef CONFIG_COMPAT
+				struct compat_atm_iobuf __user *ciobuf = arg;
+				compat_uptr_t cbuf;
+				iobuf_len = &ciobuf->length;
+				if (get_user(cbuf, &ciobuf->buffer))
+					return -EFAULT;
+				buf = compat_ptr(cbuf);
+#endif
+			} else {
+				struct atm_iobuf __user *iobuf = arg;
+				iobuf_len = &iobuf->length;
+				if (get_user(buf, &iobuf->buffer))
+					return -EFAULT;
+			}
+			if (get_user(len, iobuf_len))
 				return -EFAULT;
 			mutex_lock(&atm_dev_mutex);
 			list_for_each(p, &atm_devs)
@@ -229,7 +248,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 			}
 			mutex_unlock(&atm_dev_mutex);
 			error = ((copy_to_user(buf, tmp_buf, size)) ||
-					put_user(size, &iobuf->length))
+					put_user(size, iobuf_len))
 						? -EFAULT : 0;
 			kfree(tmp_buf);
 			return error;
@@ -237,13 +256,32 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 			break;
 	}
 
-	if (get_user(buf, &sioc->arg))
-		return -EFAULT;
-	if (get_user(len, &sioc->length))
-		return -EFAULT;
-	if (get_user(number, &sioc->number))
-		return -EFAULT;
-
+	if (compat) {
+#ifdef CONFIG_COMPAT
+		struct compat_atmif_sioc __user *csioc = arg;
+		compat_uptr_t carg;
+
+		sioc_len = &csioc->length;
+		if (get_user(carg, &csioc->arg))
+			return -EFAULT;
+		buf = compat_ptr(carg);
+
+		if (get_user(len, &csioc->length))
+			return -EFAULT;
+		if (get_user(number, &csioc->number))
+			return -EFAULT;
+#endif
+	} else {
+		struct atmif_sioc __user *sioc = arg;
+
+		sioc_len = &sioc->length;
+		if (get_user(buf, &sioc->arg))
+			return -EFAULT;
+		if (get_user(len, &sioc->length))
+			return -EFAULT;
+		if (get_user(number, &sioc->number))
+			return -EFAULT;
+	}
 	if (!(dev = try_then_request_module(atm_dev_lookup(number),
 					    "atm-device-%d", number)))
 		return -ENODEV;
@@ -358,7 +396,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 			size = error;
 			/* may return 0, but later on size == 0 means "don't
 			   write the length" */
-			error = put_user(size, &sioc->length)
+			error = put_user(size, sioc_len)
 				? -EFAULT : 0;
 			goto done;
 		case ATM_SETLOOP:
@@ -380,11 +418,21 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 			}
 			/* fall through */
 		default:
-			if (!dev->ops->ioctl) {
-				error = -EINVAL;
-				goto done;
+			if (compat) {
+#ifdef CONFIG_COMPAT
+				if (!dev->ops->compat_ioctl) {
+					error = -EINVAL;
+					goto done;
+				}
+				size = dev->ops->compat_ioctl(dev, cmd, buf);
+#endif
+			} else {
+				if (!dev->ops->ioctl) {
+					error = -EINVAL;
+					goto done;
+				}
+				size = dev->ops->ioctl(dev, cmd, buf);
 			}
-			size = dev->ops->ioctl(dev, cmd, buf);
 			if (size < 0) {
 				error = (size == -ENOIOCTLCMD ? -EINVAL : size);
 				goto done;
@@ -392,7 +440,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg)
 	}
 
 	if (size)
-		error = put_user(size, &sioc->length)
+		error = put_user(size, sioc_len)
 			? -EFAULT : 0;
 	else
 		error = 0;
diff --git a/net/atm/resources.h b/net/atm/resources.h
index 1d004aaaeec..126fb1840df 100644
--- a/net/atm/resources.h
+++ b/net/atm/resources.h
@@ -13,7 +13,7 @@
 extern struct list_head atm_devs;
 extern struct mutex atm_dev_mutex;
 
-int atm_dev_ioctl(unsigned int cmd, void __user *arg);
+int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat);
 
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/atm/svc.c b/net/atm/svc.c
index de1e4f2f3a4..e9c65500f84 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -604,6 +604,22 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	return error;
 }
 
+#ifdef CONFIG_COMPAT
+static int svc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	/* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf.
+	   But actually it takes a struct sockaddr_atmsvc, which doesn't need
+	   compat handling. So all we have to do is fix up cmd... */
+	if (cmd == COMPAT_ATM_ADDPARTY)
+		cmd = ATM_ADDPARTY;
+
+	if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY)
+		return svc_ioctl(sock, cmd, arg);
+	else
+		return vcc_compat_ioctl(sock, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
 static const struct proto_ops svc_proto_ops = {
 	.family =	PF_ATMSVC,
 	.owner =	THIS_MODULE,
@@ -616,6 +632,9 @@ static const struct proto_ops svc_proto_ops = {
 	.getname =	svc_getname,
 	.poll =		vcc_poll,
 	.ioctl =	svc_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl =	svc_compat_ioctl,
+#endif
 	.listen =	svc_listen,
 	.shutdown =	svc_shutdown,
 	.setsockopt =	svc_setsockopt,
-- 
cgit v1.2.3-70-g09d2


From ea4e2bc4d9f7370e57a343ccb5e7c0ad3222ec3c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Dec 2008 15:36:57 -0500
Subject: ftrace: graph of a single function

This patch adds the file:

   /debugfs/tracing/set_graph_function

which can be used along with the function graph tracer.

When this file is empty, the function graph tracer will act as
usual. When the file has a function in it, the function graph
tracer will only trace that function.

For example:

 # echo blk_unplug > /debugfs/tracing/set_graph_function
 # cat /debugfs/tracing/trace
 [...]
 ------------------------------------------
 | 2)  make-19003  =>  kjournald-2219
 ------------------------------------------

 2)               |  blk_unplug() {
 2)               |    dm_unplug_all() {
 2)               |      dm_get_table() {
 2)      1.381 us |        _read_lock();
 2)      0.911 us |        dm_table_get();
 2)      1. 76 us |        _read_unlock();
 2) +   12.912 us |      }
 2)               |      dm_table_unplug_all() {
 2)               |        blk_unplug() {
 2)      0.778 us |          generic_unplug_device();
 2)      2.409 us |        }
 2)      5.992 us |      }
 2)      0.813 us |      dm_table_put();
 2) +   29. 90 us |    }
 2) +   34.532 us |  }

You can add up to 32 functions into this file. Currently we limit it
to 32, but this may change with later improvements.

To add another function, use the append '>>':

  # echo sys_read >> /debugfs/tracing/set_graph_function
  # cat /debugfs/tracing/set_graph_function
  blk_unplug
  sys_read

Using the '>' will clear out the function and write anew:

  # echo sys_write > /debug/tracing/set_graph_function
  # cat /debug/tracing/set_graph_function
  sys_write

Note, if you have function graph running while doing this, the small
time between clearing it and updating it will cause the graph to
record all functions. This should not be an issue because after
it sets the filter, only those functions will be recorded from then on.
If you need to only record a particular function then set this
file first before starting the function graph tracer. In the future
this side effect may be corrected.

The set_graph_function file is similar to the set_ftrace_filter but
it does not take wild cards nor does it allow for more than one
function to be set with a single write. There is no technical reason why
this is the case, I just do not have the time yet to implement that.

Note, dynamic ftrace must be enabled for this to appear because it
uses the dynamic ftrace records to match the name to the mcount
call sites.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  46 ++++++++++
 include/linux/sched.h  |   4 +
 kernel/trace/ftrace.c  | 227 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c   |   8 ++
 kernel/trace/trace.h   |  30 ++++++-
 5 files changed, 314 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 469ceb3e85b..b295d3106bf 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kallsyms.h>
+#include <linux/bitops.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
 
@@ -391,4 +392,49 @@ static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
 #endif
 
+#ifdef CONFIG_TRACING
+#include <linux/sched.h>
+
+/* flags for current->trace */
+enum {
+	TSK_TRACE_FL_TRACE_BIT	= 0,
+	TSK_TRACE_FL_GRAPH_BIT	= 1,
+};
+enum {
+	TSK_TRACE_FL_TRACE	= 1 << TSK_TRACE_FL_TRACE_BIT,
+	TSK_TRACE_FL_GRAPH	= 1 << TSK_TRACE_FL_GRAPH_BIT,
+};
+
+static inline void set_tsk_trace_trace(struct task_struct *tsk)
+{
+	set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_trace(struct task_struct *tsk)
+{
+	clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_trace(struct task_struct *tsk)
+{
+	return tsk->trace & TSK_TRACE_FL_TRACE;
+}
+
+static inline void set_tsk_trace_graph(struct task_struct *tsk)
+{
+	set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_graph(struct task_struct *tsk)
+{
+	clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_graph(struct task_struct *tsk)
+{
+	return tsk->trace & TSK_TRACE_FL_GRAPH;
+}
+
+#endif /* CONFIG_TRACING */
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2d0a93c3122..4c152e0acc9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1380,6 +1380,10 @@ struct task_struct {
 	 */
 	atomic_t trace_overrun;
 #endif
+#ifdef CONFIG_TRACING
+	/* state flags for use by tracers */
+	unsigned long trace;
+#endif
 };
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 65b9e863056..b17a30350f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1320,6 +1320,224 @@ static struct file_operations ftrace_notrace_fops = {
 	.release = ftrace_notrace_release,
 };
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static DEFINE_MUTEX(graph_lock);
+
+int ftrace_graph_count;
+unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	unsigned long *array = m->private;
+	int index = *pos;
+
+	(*pos)++;
+
+	if (index >= ftrace_graph_count)
+		return NULL;
+
+	return &array[index];
+}
+
+static void *g_start(struct seq_file *m, loff_t *pos)
+{
+	void *p = NULL;
+
+	mutex_lock(&graph_lock);
+
+	p = g_next(m, p, pos);
+
+	return p;
+}
+
+static void g_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&graph_lock);
+}
+
+static int g_show(struct seq_file *m, void *v)
+{
+	unsigned long *ptr = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	if (!ptr)
+		return 0;
+
+	kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
+
+	seq_printf(m, "%s\n", str);
+
+	return 0;
+}
+
+static struct seq_operations ftrace_graph_seq_ops = {
+	.start = g_start,
+	.next = g_next,
+	.stop = g_stop,
+	.show = g_show,
+};
+
+static int
+ftrace_graph_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
+	mutex_lock(&graph_lock);
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND)) {
+		ftrace_graph_count = 0;
+		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
+	}
+
+	if (file->f_mode & FMODE_READ) {
+		ret = seq_open(file, &ftrace_graph_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = ftrace_graph_funcs;
+		}
+	} else
+		file->private_data = ftrace_graph_funcs;
+	mutex_unlock(&graph_lock);
+
+	return ret;
+}
+
+static ssize_t
+ftrace_graph_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_read(file, ubuf, cnt, ppos);
+	else
+		return -EPERM;
+}
+
+static int
+ftrace_set_func(unsigned long *array, int idx, char *buffer)
+{
+	char str[KSYM_SYMBOL_LEN];
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	int found = 0;
+	int i;
+
+	if (ftrace_disabled)
+		return -ENODEV;
+
+	/* should not be called from interrupt context */
+	spin_lock(&ftrace_lock);
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+				continue;
+
+			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+			if (strcmp(str, buffer) == 0) {
+				found = 1;
+				array[idx] = rec->ip;
+				break;
+			}
+		}
+	}
+	spin_unlock(&ftrace_lock);
+
+	return found ? 0 : -EINVAL;
+}
+
+static ssize_t
+ftrace_graph_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	unsigned char buffer[FTRACE_BUFF_MAX+1];
+	unsigned long *array;
+	size_t read = 0;
+	ssize_t ret;
+	int index = 0;
+	char ch;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	mutex_lock(&graph_lock);
+
+	if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		array = m->private;
+	} else
+		array = file->private_data;
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+	read++;
+	cnt--;
+
+	/* skip white space */
+	while (cnt && isspace(ch)) {
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	if (isspace(ch)) {
+		*ppos += read;
+		ret = read;
+		goto out;
+	}
+
+	while (cnt && !isspace(ch)) {
+		if (index < FTRACE_BUFF_MAX)
+			buffer[index++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+	buffer[index] = 0;
+
+	/* we allow only one at a time */
+	ret = ftrace_set_func(array, ftrace_graph_count, buffer);
+	if (ret)
+		goto out;
+
+	ftrace_graph_count++;
+
+	file->f_pos += read;
+
+	ret = read;
+ out:
+	mutex_unlock(&graph_lock);
+
+	return ret;
+}
+
+static const struct file_operations ftrace_graph_fops = {
+	.open = ftrace_graph_open,
+	.read = ftrace_graph_read,
+	.write = ftrace_graph_write,
+};
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
 	struct dentry *entry;
@@ -1347,6 +1565,15 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_notrace' entry\n");
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+				    NULL,
+				    &ftrace_graph_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_graph_function' entry\n");
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8b6409a62b5..710b39acd81 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1209,6 +1209,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	int cpu;
 	int pc;
 
+	if (!ftrace_graph_addr(trace->func))
+		return 0;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -1217,6 +1220,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 		pc = preempt_count();
 		__trace_graph_entry(tr, data, trace, flags, pc);
 	}
+	/* Only do the atomic if it is not already set */
+	if (!test_tsk_trace_graph(current))
+		set_tsk_trace_graph(current);
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 
@@ -1240,6 +1246,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 		pc = preempt_count();
 		__trace_graph_return(tr, data, trace, flags, pc);
 	}
+	if (!trace->depth)
+		clear_tsk_trace_graph(current);
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0565ae9a221..41f026bfc9e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -505,13 +505,41 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* TODO: make this variable */
+#define FTRACE_GRAPH_MAX_FUNCS		32
+extern int ftrace_graph_count;
+extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
+
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+	int i;
+
+	if (!ftrace_graph_count || test_tsk_trace_graph(current))
+		return 1;
+
+	for (i = 0; i < ftrace_graph_count; i++) {
+		if (addr == ftrace_graph_funcs[i])
+			return 1;
+	}
+
+	return 0;
+}
 #else
+static inline int ftrace_trace_addr(unsigned long addr)
+{
+	return 1
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
 	return TRACE_TYPE_UNHANDLED;
 }
-#endif
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 /*
  * trace_iterator_flags is an enumeration that defines bit
-- 
cgit v1.2.3-70-g09d2


From 5ef6476190d24419a9a537baa0b5641845136989 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 4 Dec 2008 00:26:39 -0500
Subject: pid: fix the do_each_pid_task() macro

Impact: macro side-effects fix

This patch adds parenthesis around 'pid' in the do_each_pid_task
macro to allow callers to pass in more complex parameters.

e.g.  do_each_pid_task(*pid, type, task)

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/pid.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index d7e98ff8021..bb206c56d1f 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid);
 #define do_each_pid_task(pid, type, task)				\
 	do {								\
 		struct hlist_node *pos___;				\
-		if (pid != NULL)					\
+		if ((pid) != NULL)					\
 			hlist_for_each_entry_rcu((task), pos___,	\
-				&pid->tasks[type], pids[type].node) {
+				&(pid)->tasks[type], pids[type].node) {
 
 			/*
 			 * Both old and new leaders may be attached to
-- 
cgit v1.2.3-70-g09d2


From 00ef9f7348dfd2fc223ec42aceb30836e86b367f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Dec 2008 09:00:17 +0100
Subject: lockdep: change a held lock's class

Impact: introduce new lockdep API

Allow to change a held lock's class. Basically the same as the existing
code to change a subclass therefore reuse all that.

The XFS code will be able to use this to annotate their inode locking.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h | 12 ++++++++++--
 kernel/lockdep.c        | 24 +++++++++---------------
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 8956daf64ab..37a0361f468 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -314,8 +314,15 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
-extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass,
-			      unsigned long ip);
+extern void lock_set_class(struct lockdep_map *lock, const char *name,
+			   struct lock_class_key *key, unsigned int subclass,
+			   unsigned long ip);
+
+static inline void lock_set_subclass(struct lockdep_map *lock,
+		unsigned int subclass, unsigned long ip)
+{
+	lock_set_class(lock, lock->name, lock->key, subclass, ip);
+}
 
 # define INIT_LOCKDEP				.lockdep_recursion = 0,
 
@@ -333,6 +340,7 @@ static inline void lockdep_on(void)
 
 # define lock_acquire(l, s, t, r, c, n, i)	do { } while (0)
 # define lock_release(l, n, i)			do { } while (0)
+# define lock_set_class(l, n, k, s, i)		do { } while (0)
 # define lock_set_subclass(l, s, i)		do { } while (0)
 # define lockdep_init()				do { } while (0)
 # define lockdep_info()				do { } while (0)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 90f3fb64dbc..4fa6eeb4e8a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -291,14 +291,12 @@ void lockdep_off(void)
 {
 	current->lockdep_recursion++;
 }
-
 EXPORT_SYMBOL(lockdep_off);
 
 void lockdep_on(void)
 {
 	current->lockdep_recursion--;
 }
-
 EXPORT_SYMBOL(lockdep_on);
 
 /*
@@ -2513,7 +2511,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
 	if (subclass)
 		register_lock_class(lock, subclass, 1);
 }
-
 EXPORT_SYMBOL_GPL(lockdep_init_map);
 
 /*
@@ -2694,8 +2691,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
 }
 
 static int
-__lock_set_subclass(struct lockdep_map *lock,
-		    unsigned int subclass, unsigned long ip)
+__lock_set_class(struct lockdep_map *lock, const char *name,
+		 struct lock_class_key *key, unsigned int subclass,
+		 unsigned long ip)
 {
 	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
@@ -2722,6 +2720,7 @@ __lock_set_subclass(struct lockdep_map *lock,
 	return print_unlock_inbalance_bug(curr, lock, ip);
 
 found_it:
+	lockdep_init_map(lock, name, key, 0);
 	class = register_lock_class(lock, subclass, 0);
 	hlock->class_idx = class - lock_classes + 1;
 
@@ -2906,9 +2905,9 @@ static void check_flags(unsigned long flags)
 #endif
 }
 
-void
-lock_set_subclass(struct lockdep_map *lock,
-		  unsigned int subclass, unsigned long ip)
+void lock_set_class(struct lockdep_map *lock, const char *name,
+		    struct lock_class_key *key, unsigned int subclass,
+		    unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2918,13 +2917,12 @@ lock_set_subclass(struct lockdep_map *lock,
 	raw_local_irq_save(flags);
 	current->lockdep_recursion = 1;
 	check_flags(flags);
-	if (__lock_set_subclass(lock, subclass, ip))
+	if (__lock_set_class(lock, name, key, subclass, ip))
 		check_chain_key(current);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL_GPL(lock_set_subclass);
+EXPORT_SYMBOL_GPL(lock_set_class);
 
 /*
  * We are not always called with irqs disabled - do that here,
@@ -2948,7 +2946,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-
 EXPORT_SYMBOL_GPL(lock_acquire);
 
 void lock_release(struct lockdep_map *lock, int nested,
@@ -2966,7 +2963,6 @@ void lock_release(struct lockdep_map *lock, int nested,
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-
 EXPORT_SYMBOL_GPL(lock_release);
 
 #ifdef CONFIG_LOCK_STAT
@@ -3451,7 +3447,6 @@ retry:
 	if (unlock)
 		read_unlock(&tasklist_lock);
 }
-
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 
 /*
@@ -3472,7 +3467,6 @@ void debug_show_held_locks(struct task_struct *task)
 {
 		__debug_show_held_locks(task);
 }
-
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 
 void lockdep_sys_exit(void)
-- 
cgit v1.2.3-70-g09d2


From c9bb6003dd096ad190e1594a7d835ae1c39fae8f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 29 Oct 2008 23:46:09 -0700
Subject: of: Fix comment, sparc no longer uses of_device objects on special
 busses.

It only uses of_platform_bus_type.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/of_platform.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h
index a8efcfeea73..3d327b67d7e 100644
--- a/include/linux/of_platform.h
+++ b/include/linux/of_platform.h
@@ -26,8 +26,7 @@ extern struct bus_type of_platform_bus_type;
 
 /*
  * An of_platform_driver driver is attached to a basic of_device on
- * the "platform bus" (of_platform_bus_type) (or ISA, EBUS and SBUS
- * busses on sparc).
+ * the "platform bus" (of_platform_bus_type).
  */
 struct of_platform_driver
 {
-- 
cgit v1.2.3-70-g09d2


From 21a8c466f99063eeb8567318b4e305eda9015408 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Dec 2008 23:51:23 +0100
Subject: tracing/ftrace: provide the macro task_curr_ret_stack()

Impact: cleanup

As suggested by Steven Rostedt, this patch provide a new macro
task_curr_ret_stack() to move the cpp conditionnal CONFIG into
the linux/ftrace.h headers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 11 +++++++++++
 kernel/trace/trace.c   |  8 +-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b295d3106bf..b9b4d0a22d1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -8,6 +8,7 @@
 #include <linux/types.h>
 #include <linux/kallsyms.h>
 #include <linux/bitops.h>
+#include <linux/sched.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
 
@@ -387,9 +388,19 @@ extern void unregister_ftrace_graph(void);
 
 extern void ftrace_graph_init_task(struct task_struct *t);
 extern void ftrace_graph_exit_task(struct task_struct *t);
+
+static inline int task_curr_ret_stack(struct task_struct *t)
+{
+	return t->curr_ret_stack;
+}
 #else
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
+
+static inline int task_curr_ret_stack(struct task_struct *tsk)
+{
+	return -1;
+}
 #endif
 
 #ifdef CONFIG_TRACING
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5dca6ef1fbe..7a93c663e52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3657,13 +3657,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	ret = trace_vprintk(ip, current->curr_ret_stack, fmt, ap);
-#else
-	ret = trace_vprintk(ip, -1, fmt, ap);
-#endif
-
+	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
 	va_end(ap);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 72bdcf34380917260da41e3c49e10edee04bc5cd Mon Sep 17 00:00:00 2001
From: Jouni Malinen <j@w1.fi>
Date: Wed, 26 Nov 2008 16:15:24 +0200
Subject: nl80211: Add frequency configuration (including HT40)

This patch adds new NL80211_CMD_SET_WIPHY attributes
NL80211_ATTR_WIPHY_FREQ and NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET to allow
userspace to set the operating channel (e.g., hostapd for AP mode).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h    | 23 +++++++++++++++++--
 include/net/cfg80211.h     |  9 ++++++++
 include/net/mac80211.h     |  3 +++
 net/mac80211/cfg.c         | 13 +++++++++++
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/main.c        | 30 ++++++++++++++++++++----
 net/mac80211/util.c        |  1 +
 net/wireless/nl80211.c     | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 131 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index e08c8bcfb78..92f79d2bdd8 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -26,8 +26,9 @@
  * @NL80211_CMD_GET_WIPHY: request information about a wiphy or dump request
  *	to get a list of all present wiphys.
  * @NL80211_CMD_SET_WIPHY: set wiphy parameters, needs %NL80211_ATTR_WIPHY or
- *	%NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME
- *	and/or %NL80211_ATTR_WIPHY_TXQ_PARAMS.
+ *	%NL80211_ATTR_IFINDEX; can be used to set %NL80211_ATTR_WIPHY_NAME,
+ *	%NL80211_ATTR_WIPHY_TXQ_PARAMS, %NL80211_ATTR_WIPHY_FREQ, and/or
+ *	%NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET.
  * @NL80211_CMD_NEW_WIPHY: Newly created wiphy, response to get request
  *	or rename notification. Has attributes %NL80211_ATTR_WIPHY and
  *	%NL80211_ATTR_WIPHY_NAME.
@@ -180,6 +181,14 @@ enum nl80211_commands {
  *	/sys/class/ieee80211/<phyname>/index
  * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming)
  * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters
+ * @NL80211_ATTR_WIPHY_FREQ: frequency of the selected channel in MHz
+ * @NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET: included with NL80211_ATTR_WIPHY_FREQ
+ *	if HT20 or HT40 are allowed (i.e., 802.11n disabled if not included):
+ *	NL80211_SEC_CHAN_NO_HT = HT not allowed (i.e., same as not including
+ *		this attribute)
+ *	NL80211_SEC_CHAN_DISABLED = HT20 only
+ *	NL80211_SEC_CHAN_BELOW = secondary channel is below the primary channel
+ *	NL80211_SEC_CHAN_ABOVE = secondary channel is above the primary channel
  *
  * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on
  * @NL80211_ATTR_IFNAME: network interface name
@@ -315,6 +324,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_BSS_BASIC_RATES,
 
 	NL80211_ATTR_WIPHY_TXQ_PARAMS,
+	NL80211_ATTR_WIPHY_FREQ,
+	NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET,
 
 	/* add attributes here, update the policy in nl80211.c */
 
@@ -329,6 +340,8 @@ enum nl80211_attrs {
 #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY
 #define NL80211_ATTR_BSS_BASIC_RATES NL80211_ATTR_BSS_BASIC_RATES
 #define NL80211_ATTR_WIPHY_TXQ_PARAMS NL80211_ATTR_WIPHY_TXQ_PARAMS
+#define NL80211_ATTR_WIPHY_FREQ NL80211_ATTR_WIPHY_FREQ
+#define NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET
 
 #define NL80211_MAX_SUPP_RATES			32
 #define NL80211_MAX_SUPP_REG_RULES		32
@@ -742,4 +755,10 @@ enum nl80211_txq_q {
 	NL80211_TXQ_Q_BK
 };
 
+enum nl80211_sec_chan_offset {
+	NL80211_SEC_CHAN_NO_HT /* No HT */,
+	NL80211_SEC_CHAN_DISABLED /* HT20 only */,
+	NL80211_SEC_CHAN_BELOW /* HT40- */,
+	NL80211_SEC_CHAN_ABOVE /* HT40+ */
+};
 #endif /* __LINUX_NL80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1d57835d73f..53b06f6c62c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -392,6 +392,9 @@ struct ieee80211_txq_params {
 /* from net/wireless.h */
 struct wiphy;
 
+/* from net/ieee80211.h */
+struct ieee80211_channel;
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -450,6 +453,8 @@ struct wiphy;
  * @change_bss: Modify parameters for a given BSS.
  *
  * @set_txq_params: Set TX queue parameters
+ *
+ * @set_channel: Set channel
  */
 struct cfg80211_ops {
 	int	(*add_virtual_intf)(struct wiphy *wiphy, char *name,
@@ -513,6 +518,10 @@ struct cfg80211_ops {
 
 	int	(*set_txq_params)(struct wiphy *wiphy,
 				  struct ieee80211_txq_params *params);
+
+	int	(*set_channel)(struct wiphy *wiphy,
+			       struct ieee80211_channel *chan,
+			       enum nl80211_sec_chan_offset);
 };
 
 #endif /* __NET_CFG80211_H */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6a1d4ea1818..6e823cc6a4b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -507,6 +507,9 @@ static inline int __deprecated __IEEE80211_CONF_SHORT_SLOT_TIME(void)
 
 struct ieee80211_ht_conf {
 	bool enabled;
+	int sec_chan_offset; /* 0 = HT40 disabled; -1 = HT40 enabled, secondary
+			      * channel below primary; 1 = HT40 enabled,
+			      * secondary channel above primary */
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 16423f94801..7a7a6c176dc 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1095,6 +1095,18 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 	return 0;
 }
 
+static int ieee80211_set_channel(struct wiphy *wiphy,
+				 struct ieee80211_channel *chan,
+				 enum nl80211_sec_chan_offset sec_chan_offset)
+{
+	struct ieee80211_local *local = wiphy_priv(wiphy);
+
+	local->oper_channel = chan;
+	local->oper_sec_chan_offset = sec_chan_offset;
+
+	return ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
+}
+
 struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -1122,4 +1134,5 @@ struct cfg80211_ops mac80211_config_ops = {
 #endif
 	.change_bss = ieee80211_change_bss,
 	.set_txq_params = ieee80211_set_txq_params,
+	.set_channel = ieee80211_set_channel,
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 155a2041001..527205f8c1a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -626,6 +626,7 @@ struct ieee80211_local {
 	struct delayed_work scan_work;
 	struct ieee80211_sub_if_data *scan_sdata;
 	struct ieee80211_channel *oper_channel, *scan_channel;
+	enum nl80211_sec_chan_offset oper_sec_chan_offset;
 	u8 scan_ssid[IEEE80211_MAX_SSID_LEN];
 	size_t scan_ssid_len;
 	struct list_head bss_list;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index cec9b6d3e1c..29c3ecf7e91 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -195,20 +195,42 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 	struct ieee80211_channel *chan;
 	int ret = 0;
 	int power;
+	enum nl80211_sec_chan_offset sec_chan_offset;
 
 	might_sleep();
 
-	if (local->sw_scanning)
+	if (local->sw_scanning) {
 		chan = local->scan_channel;
-	else
+		sec_chan_offset = NL80211_SEC_CHAN_NO_HT;
+	} else {
 		chan = local->oper_channel;
+		sec_chan_offset = local->oper_sec_chan_offset;
+	}
 
-	if (chan != local->hw.conf.channel) {
+	if (chan != local->hw.conf.channel ||
+	    sec_chan_offset != local->hw.conf.ht.sec_chan_offset) {
 		local->hw.conf.channel = chan;
+		switch (sec_chan_offset) {
+		case NL80211_SEC_CHAN_NO_HT:
+			local->hw.conf.ht.enabled = false;
+			local->hw.conf.ht.sec_chan_offset = 0;
+			break;
+		case NL80211_SEC_CHAN_DISABLED:
+			local->hw.conf.ht.enabled = true;
+			local->hw.conf.ht.sec_chan_offset = 0;
+			break;
+		case NL80211_SEC_CHAN_BELOW:
+			local->hw.conf.ht.enabled = true;
+			local->hw.conf.ht.sec_chan_offset = -1;
+			break;
+		case NL80211_SEC_CHAN_ABOVE:
+			local->hw.conf.ht.enabled = true;
+			local->hw.conf.ht.sec_chan_offset = 1;
+			break;
+		}
 		changed |= IEEE80211_CONF_CHANGE_CHANNEL;
 	}
 
-
 	if (!local->hw.conf.power_level)
 		power = chan->max_power;
 	else
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 0f841317c7e..505d68f344c 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -641,6 +641,7 @@ int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
 		    chan->flags & IEEE80211_CHAN_NO_IBSS)
 			return ret;
 		local->oper_channel = chan;
+		local->oper_sec_chan_offset = NL80211_SEC_CHAN_NO_HT;
 
 		if (local->sw_scanning || local->hw_scanning)
 			ret = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c9141e3df9b..9caee6022e3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -59,6 +59,8 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 	[NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING,
 				      .len = BUS_ID_SIZE-1 },
 	[NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED },
+	[NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET] = { .type = NLA_U32 },
 
 	[NL80211_ATTR_IFTYPE] = { .type = NLA_U32 },
 	[NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
@@ -359,6 +361,61 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
+		enum nl80211_sec_chan_offset sec_chan_offset =
+			NL80211_SEC_CHAN_NO_HT;
+		struct ieee80211_channel *chan;
+		u32 freq, sec_freq;
+
+		if (!rdev->ops->set_channel) {
+			result = -EOPNOTSUPP;
+			goto bad_res;
+		}
+
+		if (info->attrs[NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET]) {
+			sec_chan_offset = nla_get_u32(
+				info->attrs[
+					NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET]);
+			if (sec_chan_offset != NL80211_SEC_CHAN_NO_HT &&
+			    sec_chan_offset != NL80211_SEC_CHAN_DISABLED &&
+			    sec_chan_offset != NL80211_SEC_CHAN_BELOW &&
+			    sec_chan_offset != NL80211_SEC_CHAN_ABOVE) {
+				result = -EINVAL;
+				goto bad_res;
+			}
+		}
+
+		freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]);
+		chan = ieee80211_get_channel(&rdev->wiphy, freq);
+		if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) {
+			/* Primary channel not allowed */
+			result = -EINVAL;
+			goto bad_res;
+		}
+		if (sec_chan_offset == NL80211_SEC_CHAN_BELOW)
+			sec_freq = freq - 20;
+		else if (sec_chan_offset == NL80211_SEC_CHAN_ABOVE)
+			sec_freq = freq + 20;
+		else
+			sec_freq = 0;
+
+		if (sec_freq) {
+			struct ieee80211_channel *schan;
+			schan = ieee80211_get_channel(&rdev->wiphy, sec_freq);
+			if (!schan || schan->flags & IEEE80211_CHAN_DISABLED) {
+				/* Secondary channel not allowed */
+				result = -EINVAL;
+				goto bad_res;
+			}
+		}
+
+		result = rdev->ops->set_channel(&rdev->wiphy, chan,
+						sec_chan_offset);
+		if (result)
+			goto bad_res;
+	}
+
+
 bad_res:
 	cfg80211_put_dev(rdev);
 	return result;
-- 
cgit v1.2.3-70-g09d2


From 10ec4f1d0851eb97cd53db66150835dd7f64829d Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <lrodriguez@atheros.com>
Date: Wed, 26 Nov 2008 13:03:08 -0800
Subject: nl80211: relicense nl80211.h under the ISC

We have a few BSD/ISC licensed userspace applications which
include nl80211.h from the kernel. To avoid legal ambiguity
for usage of the header file in these projects we rather simply
relicense the header file under the ISC. We've received consent
from all contributors to it.

Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Michael Wu <flamingice@sourmilk.net>
Acked-by: Luis Carlos Cobo <luisca@cozybit.com>
Acked-by: Michael Buesch <mb@bu3sch.de>
Acked-by: Jouni Malinen <jouni.malinen@atheros.com>
Acked-by: Colin McCabe <colin@cozybit.com>
Acked-by: Javier Cardona <javier@cozybit.com>
Cc: johannes@sipsolutions.net
Cc: altape@eden.rutgers.edu
Cc: luisca@cozybit.com
Cc: mb@bu3sch.de
Cc: jouni.malinen@atheros.com
Cc: colin@cozybit.com
Cc: javier@cozybit.com
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 92f79d2bdd8..04d4516f9c7 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -3,7 +3,26 @@
 /*
  * 802.11 netlink interface public header
  *
- * Copyright 2006, 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2006, 2007, 2008 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright 2008 Michael Wu <flamingice@sourmilk.net>
+ * Copyright 2008 Luis Carlos Cobo <luisca@cozybit.com>
+ * Copyright 2008 Michael Buesch <mb@bu3sch.de>
+ * Copyright 2008 Luis R. Rodriguez <lrodriguez@atheros.com>
+ * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
+ * Copyright 2008 Colin McCabe <colin@cozybit.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
  */
 
 /**
-- 
cgit v1.2.3-70-g09d2


From b74ca3a896b9ab5f952bc440154758e708c48884 Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Mon, 8 Dec 2008 01:14:16 -0800
Subject: netdevice: Kill netdev->priv

This is the last shoot of this series.
After I removing all directly reference of netdev->priv, I am killing
"priv" of "struct net_device" and fixing relative comments/docs.

Anyone will not be allowed to reference netdev->priv directly.
If you want to reference the memory of private data, use netdev_priv()
instead.
If the private data is not allocted when alloc_netdev(), use
netdev->ml_priv to point that memory after you creating that private
data.

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/driver.txt     | 2 +-
 Documentation/networking/netdevices.txt | 2 +-
 drivers/net/3c501.h                     | 2 +-
 drivers/net/atp.c                       | 2 +-
 drivers/net/eexpress.c                  | 2 +-
 drivers/net/forcedeth.c                 | 4 ++--
 drivers/net/lance.c                     | 2 +-
 drivers/net/myri_sbus.c                 | 2 +-
 drivers/net/pci-skeleton.c              | 2 +-
 drivers/net/sun3_82586.c                | 2 +-
 drivers/net/sunbmac.c                   | 2 +-
 drivers/net/tokenring/3c359.c           | 5 +++--
 drivers/net/via-rhine.c                 | 9 +++++----
 drivers/net/wireless/strip.c            | 2 +-
 include/linux/hdlc.h                    | 2 +-
 include/linux/netdevice.h               | 1 -
 net/atm/mpc.c                           | 4 ++--
 net/core/dev.c                          | 6 ------
 18 files changed, 24 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/driver.txt b/Documentation/networking/driver.txt
index ea72d2e66ca..03283daa64f 100644
--- a/Documentation/networking/driver.txt
+++ b/Documentation/networking/driver.txt
@@ -13,7 +13,7 @@ Transmit path guidelines:
 	static int drv_hard_start_xmit(struct sk_buff *skb,
 		   		       struct net_device *dev)
 	{
-		struct drv *dp = dev->priv;
+		struct drv *dp = netdev_priv(dev);
 
 		lock_tx(dp);
 		...
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index d0f71fc7f78..a2ab6a0b116 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -18,7 +18,7 @@ There are routines in net_init.c to handle the common cases of
 alloc_etherdev, alloc_netdev.  These reserve extra space for driver
 private data which gets freed when the network device is freed. If
 separately allocated data is attached to the network device
-(dev->priv) then it is up to the module exit handler to free that.
+(netdev_priv(dev)) then it is up to the module exit handler to free that.
 
 MTU
 ===
diff --git a/drivers/net/3c501.h b/drivers/net/3c501.h
index cfec64efff7..f40b0493337 100644
--- a/drivers/net/3c501.h
+++ b/drivers/net/3c501.h
@@ -23,7 +23,7 @@ static const struct ethtool_ops netdev_ethtool_ops;
 static int el_debug = EL_DEBUG;
 
 /*
- *	Board-specific info in dev->priv.
+ *	Board-specific info in netdev_priv(dev).
  */
 
 struct net_local
diff --git a/drivers/net/atp.c b/drivers/net/atp.c
index 7028b276dfd..1d6b74c5d6c 100644
--- a/drivers/net/atp.c
+++ b/drivers/net/atp.c
@@ -420,7 +420,7 @@ static unsigned short __init eeprom_op(long ioaddr, u32 cmd)
    registers that "should" only need to be set once at boot, so that
    there is non-reboot way to recover if something goes wrong.
 
-   This is an attachable device: if there is no dev->priv entry then it wasn't
+   This is an attachable device: if there is no private entry then it wasn't
    probed for at boot-time, and we need to probe for it again.
    */
 static int net_open(struct net_device *dev)
diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c
index a125e41240f..9ff3f2f5e38 100644
--- a/drivers/net/eexpress.c
+++ b/drivers/net/eexpress.c
@@ -1046,7 +1046,7 @@ static void eexp_hw_tx_pio(struct net_device *dev, unsigned short *buf,
 /*
  * Sanity check the suspected EtherExpress card
  * Read hardware address, reset card, size memory and initialize buffer
- * memory pointers. These are held in dev->priv, in case someone has more
+ * memory pointers. These are held in netdev_priv(), in case someone has more
  * than one card in a machine.
  */
 
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 12384df8cb2..1f2b24743ee 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -712,12 +712,12 @@ struct nv_skb_map {
 
 /*
  * SMP locking:
- * All hardware access under dev->priv->lock, except the performance
+ * All hardware access under netdev_priv(dev)->lock, except the performance
  * critical parts:
  * - rx is (pseudo-) lockless: it relies on the single-threading provided
  *	by the arch code for interrupts.
  * - tx setup is lockless: it relies on netif_tx_lock. Actual submission
- *	needs dev->priv->lock :-(
+ *	needs netdev_priv(dev)->lock :-(
  * - set_multicast_list: preparation lockless, relies on netif_tx_lock.
  */
 
diff --git a/drivers/net/lance.c b/drivers/net/lance.c
index e81b6113ed9..d7afb938ea6 100644
--- a/drivers/net/lance.c
+++ b/drivers/net/lance.c
@@ -519,7 +519,7 @@ static int __init lance_probe1(struct net_device *dev, int ioaddr, int irq, int
 		}
 	}
 
-	/* We can't allocate dev->priv from alloc_etherdev() because it must
+	/* We can't allocate private data from alloc_etherdev() because it must
 	   a ISA DMA-able region. */
 	chipname = chip_table[lance_version].name;
 	printk("%s: %s at %#3x, ", dev->name, chipname, ioaddr);
diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c
index 6833f65f8ae..899ed065a14 100644
--- a/drivers/net/myri_sbus.c
+++ b/drivers/net/myri_sbus.c
@@ -1091,7 +1091,7 @@ static int __devinit myri_sbus_probe(struct of_device *op, const struct of_devic
 err_free_irq:
 	free_irq(dev->irq, dev);
 err:
-	/* This will also free the co-allocated 'dev->priv' */
+	/* This will also free the co-allocated private data*/
 	free_netdev(dev);
 	return -ENODEV;
 }
diff --git a/drivers/net/pci-skeleton.c b/drivers/net/pci-skeleton.c
index b23b5c397b1..c95fd72c3bb 100644
--- a/drivers/net/pci-skeleton.c
+++ b/drivers/net/pci-skeleton.c
@@ -781,7 +781,7 @@ static int __devinit netdrv_init_one (struct pci_dev *pdev,
 	dev->irq = pdev->irq;
 	dev->base_addr = (unsigned long) ioaddr;
 
-	/* dev->priv/tp zeroed and aligned in alloc_etherdev */
+	/* netdev_priv()/tp zeroed and aligned in alloc_etherdev */
 	tp = netdev_priv(dev);
 
 	/* note: tp->chipset set in netdrv_init_board */
diff --git a/drivers/net/sun3_82586.c b/drivers/net/sun3_82586.c
index e8f97d5c9c2..e0d84772771 100644
--- a/drivers/net/sun3_82586.c
+++ b/drivers/net/sun3_82586.c
@@ -209,7 +209,7 @@ static int sun3_82586_open(struct net_device *dev)
 static int check586(struct net_device *dev,char *where,unsigned size)
 {
 	struct priv pb;
-	struct priv *p = /* (struct priv *) dev->priv*/ &pb;
+	struct priv *p = &pb;
 	char *iscp_addr;
 	int i;
 
diff --git a/drivers/net/sunbmac.c b/drivers/net/sunbmac.c
index 977b3e08bbf..7f69c7f176c 100644
--- a/drivers/net/sunbmac.c
+++ b/drivers/net/sunbmac.c
@@ -1233,7 +1233,7 @@ fail_and_cleanup:
 				  bp->bmac_block,
 				  bp->bblock_dvma);
 
-	/* This also frees the co-located 'dev->priv' */
+	/* This also frees the co-located private data */
 	free_netdev(dev);
 	return -ENODEV;
 }
diff --git a/drivers/net/tokenring/3c359.c b/drivers/net/tokenring/3c359.c
index e7a944657cf..43853e3b210 100644
--- a/drivers/net/tokenring/3c359.c
+++ b/drivers/net/tokenring/3c359.c
@@ -296,8 +296,9 @@ static int __devinit xl_probe(struct pci_dev *pdev,
 	} ; 
 
 	/* 
-	 * Allowing init_trdev to allocate the dev->priv structure will align xl_private
-   	 * on a 32 bytes boundary which we need for the rx/tx descriptors
+	 * Allowing init_trdev to allocate the private data will align
+	 * xl_private on a 32 bytes boundary which we need for the rx/tx
+	 * descriptors
 	 */
 
 	dev = alloc_trdev(sizeof(struct xl_private)) ; 
diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c
index 93b74b7b707..8d405c83df8 100644
--- a/drivers/net/via-rhine.c
+++ b/drivers/net/via-rhine.c
@@ -191,12 +191,13 @@ IIId. Synchronization
 
 The driver runs as two independent, single-threaded flows of control. One
 is the send-packet routine, which enforces single-threaded use by the
-dev->priv->lock spinlock. The other thread is the interrupt handler, which
-is single threaded by the hardware and interrupt handling software.
+netdev_priv(dev)->lock spinlock. The other thread is the interrupt handler,
+which is single threaded by the hardware and interrupt handling software.
 
 The send packet thread has partial control over the Tx ring. It locks the
-dev->priv->lock whenever it's queuing a Tx packet. If the next slot in the ring
-is not available it stops the transmit queue by calling netif_stop_queue.
+netdev_priv(dev)->lock whenever it's queuing a Tx packet. If the next slot in
+the ring is not available it stops the transmit queue by
+calling netif_stop_queue.
 
 The interrupt handler has exclusive control over the Rx ring and records stats
 from the Tx ring. After reaping the stats, it marks the Tx queue entry as
diff --git a/drivers/net/wireless/strip.c b/drivers/net/wireless/strip.c
index 692e6c5e009..dd0de3a9ed4 100644
--- a/drivers/net/wireless/strip.c
+++ b/drivers/net/wireless/strip.c
@@ -2494,7 +2494,7 @@ static void strip_dev_setup(struct net_device *dev)
 	dev->type = ARPHRD_METRICOM;	/* dtang */
 	dev->hard_header_len = sizeof(STRIP_Header);
 	/*
-	 *  dev->priv             Already holds a pointer to our struct strip
+	 *  netdev_priv(dev) Already holds a pointer to our struct strip
 	 */
 
 	*(MetricomAddress *) & dev->broadcast = broadcast_address;
diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index e960faac609..fd47a151665 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -43,7 +43,7 @@ struct hdlc_proto {
 };
 
 
-/* Pointed to by dev->priv */
+/* Pointed to by netdev_priv(dev) */
 typedef struct hdlc_device {
 	/* used by HDLC layer to take control over HDLC device from hw driver*/
 	int (*attach)(struct net_device *dev,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0df0db068ac..47e73152831 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -785,7 +785,6 @@ struct net_device
 /*
  * One part is mostly used on xmit path (device)
  */
-	void			*priv;	/* pointer to private data	*/
 	/* These may be needed for future network-power-down code. */
 	unsigned long		trans_start;	/* Time (in jiffies) of last Tx	*/
 
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 12e9ea371db..039d5cc72c3 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -341,8 +341,8 @@ static const char *mpoa_device_type_string(char type)
 }
 
 /*
- * lec device calls this via its dev->priv->lane2_ops->associate_indicator()
- * when it sees a TLV in LE_ARP packet.
+ * lec device calls this via its netdev_priv(dev)->lane2_ops
+ * ->associate_indicator() when it sees a TLV in LE_ARP packet.
  * We fill in the pointer above when we see a LANE2 lec initializing
  * See LANE2 spec 3.1.5
  *
diff --git a/net/core/dev.c b/net/core/dev.c
index 4615e9a443a..f54cac76438 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4378,12 +4378,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev->num_tx_queues = queue_count;
 	dev->real_num_tx_queues = queue_count;
 
-	if (sizeof_priv) {
-		dev->priv = ((char *)dev +
-			     ((sizeof(struct net_device) + NETDEV_ALIGN_CONST)
-			      & ~NETDEV_ALIGN_CONST));
-	}
-
 	dev->gso_max_size = GSO_MAX_SIZE;
 
 	netdev_init_queues(dev);
-- 
cgit v1.2.3-70-g09d2


From 0049bab5e765aa74cf767a834fa336e19453fc5e Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Mon, 8 Dec 2008 01:18:05 -0800
Subject: dccp: Remove obsolete parts of the old CCID interface

The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector
case, their value equals initially that of the sysctl, but at the end of
feature negotiation may be something different.

The old interface removed by this patch thus has been replaced by the newer
interface to dynamically query the currently loaded CCIDs.

Also removed are the constructors for the TX CCID and the RX CCID, since the
switch "rx <-> non-rx" is done by the handler in minisocks.c (and the handler
is the only place in the code where CCIDs are loaded).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dccp.txt |  5 +++--
 include/linux/dccp.h              |  3 ---
 net/dccp/ccid.c                   | 14 --------------
 net/dccp/ccid.h                   |  5 -----
 net/dccp/feat.c                   | 13 -------------
 net/dccp/minisocks.c              |  2 --
 6 files changed, 3 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 610083ff73f..a203d132dbe 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -140,10 +140,11 @@ send_ackvec = 1
 	Whether or not to send Ack Vector options (sec. 11.5).
 
 tx_ccid = 2
-	Default CCID for the sender-receiver half-connection.
+	Default CCID for the sender-receiver half-connection. Depending on the
+	choice of CCID, the Send Ack Vector feature is enabled automatically.
 
 rx_ccid = 2
-	Default CCID for the receiver-sender half-connection.
+	Default CCID for the receiver-sender half-connection; see tx_ccid.
 
 seq_window = 100
 	The initial sequence window (sec. 7.5.2).
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 6a72ff52a8a..46daea312d9 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -370,7 +370,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
   * Will be used to pass the state from dccp_request_sock to dccp_sock.
   *
   * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
-  * @dccpms_ccid - Congestion Control Id (CCID) (section 10)
   * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
   * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
   * @dccpms_pending - List of features being negotiated
@@ -378,8 +377,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
   */
 struct dccp_minisock {
 	__u64			dccpms_sequence_window;
-	__u8			dccpms_rx_ccid;
-	__u8			dccpms_tx_ccid;
 	__u8			dccpms_send_ack_vector;
 	__u8			dccpms_send_ndp_count;
 	struct list_head	dccpms_pending;
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 647cb0614f8..bcc643f992a 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -253,20 +253,6 @@ out_module_put:
 
 EXPORT_SYMBOL_GPL(ccid_new);
 
-struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp)
-{
-	return ccid_new(id, sk, 1, gfp);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_rx_new);
-
-struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk,  gfp_t gfp)
-{
-	return ccid_new(id, sk, 0, gfp);
-}
-
-EXPORT_SYMBOL_GPL(ccid_hc_tx_new);
-
 static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx)
 {
 	struct ccid_operations *ccid_ops;
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 803343aed00..18f69423a70 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -111,11 +111,6 @@ extern int  ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
 extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx,
 			     gfp_t gfp);
 
-extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk,
-				   gfp_t gfp);
-extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk,
-				   gfp_t gfp);
-
 static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
 {
 	struct ccid *ccid = dp->dccps_hc_rx_ccid;
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index 077f78d579c..a0d5891a37b 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -1124,22 +1124,9 @@ int dccp_feat_init(struct sock *sk)
 	INIT_LIST_HEAD(&dmsk->dccpms_pending);	/* XXX no longer used */
 	INIT_LIST_HEAD(&dmsk->dccpms_conf);	/* XXX no longer used */
 
-	/* CCID L */
-	rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0,
-				&dmsk->dccpms_tx_ccid, 1);
-	if (rc)
-		goto out;
-
-	/* CCID R */
-	rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0,
-				&dmsk->dccpms_rx_ccid, 1);
-	if (rc)
-		goto out;
-
 	/* Ack ratio */
 	rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0,
 				dp->dccps_l_ack_ratio);
-out:
 	return rc;
 }
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 308b6b928c3..210c346899b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -45,8 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row);
 void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
-	dmsk->dccpms_rx_ccid	     = sysctl_dccp_feat_rx_ccid;
-	dmsk->dccpms_tx_ccid	     = sysctl_dccp_feat_tx_ccid;
 	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
 	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
 }
-- 
cgit v1.2.3-70-g09d2


From 4098dce5be537a157eed4a326efd464109825b8b Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Mon, 8 Dec 2008 01:18:37 -0800
Subject: dccp: Remove manual influence on NDP Count feature

Updating the NDP count feature is handled automatically now:
 * for CCID-2 it is disabled, since the code does not use NDP counts;
 * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths.

Allowing the user to change NDP values leads to unpredictable and failing
behaviour, since it is then possible to disable NDP counts even when they
are needed (e.g. in CCID-3).

This means that only those user settings are sensible that agree with the
values for Send NDP Count implied by the choice of CCID. But those settings
are already activated by the feature negotiation (CCID dependency tracking),
hence this form of support is redundant.

At startup the initialisation of the NDP count feature uses the default
value of 0, which is done implicitly by the zeroing-out of the socket when
it is allocated. If the choice of CCID or feature negotiation enables NDP
count, this will then be updated via the NDP activation handler.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dccp.txt | 3 ---
 include/linux/dccp.h              | 4 ++--
 net/dccp/dccp.h                   | 1 -
 net/dccp/feat.c                   | 2 +-
 net/dccp/minisocks.c              | 1 -
 net/dccp/options.c                | 4 +---
 net/dccp/sysctl.c                 | 7 -------
 7 files changed, 4 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index a203d132dbe..1403745ab40 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -133,9 +133,6 @@ retries2
 	importance for retransmitted acknowledgments and feature negotiation,
 	data packets are never retransmitted. Analogue of tcp_retries2.
 
-send_ndp = 1
-	Whether or not to send NDP count options (sec. 7.7.2).
-
 send_ackvec = 1
 	Whether or not to send Ack Vector options (sec. 11.5).
 
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 46daea312d9..60e94438ead 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -371,14 +371,12 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
   *
   * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
   * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
-  * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2)
   * @dccpms_pending - List of features being negotiated
   * @dccpms_conf -
   */
 struct dccp_minisock {
 	__u64			dccpms_sequence_window;
 	__u8			dccpms_send_ack_vector;
-	__u8			dccpms_send_ndp_count;
 	struct list_head	dccpms_pending;
 	struct list_head	dccpms_conf;
 };
@@ -490,6 +488,7 @@ struct dccp_ackvec;
  * @dccps_r_ack_ratio - feature-remote Ack Ratio
  * @dccps_pcslen - sender   partial checksum coverage (via sockopt)
  * @dccps_pcrlen - receiver partial checksum coverage (via sockopt)
+ * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2)
  * @dccps_ndp_count - number of Non Data Packets since last data packet
  * @dccps_mss_cache - current value of MSS (path MTU minus header sizes)
  * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4)
@@ -529,6 +528,7 @@ struct dccp_sock {
 	__u16				dccps_r_ack_ratio;
 	__u8				dccps_pcslen:4;
 	__u8				dccps_pcrlen:4;
+	__u8				dccps_send_ndp_count:1;
 	__u64				dccps_ndp_count:48;
 	unsigned long			dccps_rate_last;
 	struct dccp_minisock		dccps_minisock;
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 94f6785f81e..e0759d098b9 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -99,7 +99,6 @@ extern int  sysctl_dccp_feat_sequence_window;
 extern int  sysctl_dccp_feat_rx_ccid;
 extern int  sysctl_dccp_feat_tx_ccid;
 extern int  sysctl_dccp_feat_send_ack_vector;
-extern int  sysctl_dccp_feat_send_ndp_count;
 extern int  sysctl_dccp_tx_qlen;
 extern int  sysctl_dccp_sync_ratelimit;
 
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
index a0d5891a37b..30f9fb76b92 100644
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -85,7 +85,7 @@ static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
 static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
 {
 	if (!rx)
-		dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0);
+		dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
 	return 0;
 }
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 210c346899b..02b14812464 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -46,7 +46,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
 	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
-	dmsk->dccpms_send_ndp_count  = sysctl_dccp_feat_send_ndp_count;
 }
 
 void dccp_time_wait(struct sock *sk, int state, int timeo)
diff --git a/net/dccp/options.c b/net/dccp/options.c
index debb1008c7a..e9d674874b4 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -27,7 +27,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
 int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
-int sysctl_dccp_feat_send_ndp_count  = DCCPF_INITIAL_SEND_NDP_COUNT;
 
 u64 dccp_decode_value_var(const u8 *bf, const u8 len)
 {
@@ -531,8 +530,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
-	if (dmsk->dccpms_send_ndp_count &&
-	    dccp_insert_option_ndp(sk, skb))
+	if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
 		return -1;
 
 	if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index f6e54f433e2..587c12f915c 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -47,13 +47,6 @@ static struct ctl_table dccp_default_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "send_ndp",
-		.data		= &sysctl_dccp_feat_send_ndp_count,
-		.maxlen		= sizeof(sysctl_dccp_feat_send_ndp_count),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "request_retries",
 		.data		= &sysctl_dccp_request_retries,
-- 
cgit v1.2.3-70-g09d2


From 6fdd34d43bff8be9bb925b49d87a0ee144d2ab07 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Mon, 8 Dec 2008 01:19:06 -0800
Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl

This removes the use of the sysctl and the minisock variable for the Send Ack
Vector feature, as it now is handled fully dynamically via feature negotiation
(i.e. when CCID-2 is enabled, Ack Vectors are automatically enabled as per
 RFC 4341, 4.).

Using a sysctl in parallel to this implementation would open the door to
crashes, since much of the code relies on tests of the boolean minisock /
sysctl variable. Thus, this patch replaces all tests of type

	if (dccp_msk(sk)->dccpms_send_ack_vector)
		/* ... */
with
	if (dp->dccps_hc_rx_ackvec != NULL)
		/* ... */

The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature
negotiation concluded that Ack Vectors are to be used on the half-connection.
Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child),
so that the test is a valid one.

The activation handler for Ack Vectors is called as soon as the feature
negotiation has concluded at the
 * server when the Ack marking the transition RESPOND => OPEN arrives;
 * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN.

Adding the sequence number of the Response packet to the Ack Vector has been
removed, since
 (a) connection establishment implies that the Response has been received;
 (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e.
     this entry will always be ignored;
 (c) it can not be used for anything useful - to detect loss for instance, only
     packets received after the loss can serve as pseudo-dupacks.

There was a FIXME to change the error code when dccp_ackvec_add() fails.
I removed this after finding out that:
 * the check whether ackno < ISN is already made earlier,
 * this Response is likely the 1st packet with an Ackno that the client gets,
 * so when dccp_ackvec_add() fails, the reason is likely not a packet error.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dccp.txt |  3 ---
 include/linux/dccp.h              |  3 ---
 net/dccp/dccp.h                   |  3 +--
 net/dccp/diag.c                   |  2 +-
 net/dccp/input.c                  | 12 +++---------
 net/dccp/minisocks.c              |  1 -
 net/dccp/options.c                |  7 ++-----
 net/dccp/proto.c                  |  3 +--
 net/dccp/sysctl.c                 |  7 -------
 9 files changed, 8 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt
index 1403745ab40..7a3bb1abb83 100644
--- a/Documentation/networking/dccp.txt
+++ b/Documentation/networking/dccp.txt
@@ -133,9 +133,6 @@ retries2
 	importance for retransmitted acknowledgments and feature negotiation,
 	data packets are never retransmitted. Analogue of tcp_retries2.
 
-send_ackvec = 1
-	Whether or not to send Ack Vector options (sec. 11.5).
-
 tx_ccid = 2
 	Default CCID for the sender-receiver half-connection. Depending on the
 	choice of CCID, the Send Ack Vector feature is enabled automatically.
diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 60e94438ead..61734e27abb 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
 #define DCCPF_INITIAL_SEQUENCE_WINDOW		100
 #define DCCPF_INITIAL_ACK_RATIO			2
 #define DCCPF_INITIAL_CCID			DCCPC_CCID2
-#define DCCPF_INITIAL_SEND_ACK_VECTOR		1
 /* FIXME: for now we're default to 1 but it should really be 0 */
 #define DCCPF_INITIAL_SEND_NDP_COUNT		1
 
@@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb)
   * Will be used to pass the state from dccp_request_sock to dccp_sock.
   *
   * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2)
-  * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5)
   * @dccpms_pending - List of features being negotiated
   * @dccpms_conf -
   */
 struct dccp_minisock {
 	__u64			dccpms_sequence_window;
-	__u8			dccpms_send_ack_vector;
 	struct list_head	dccpms_pending;
 	struct list_head	dccpms_conf;
 };
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index e0759d098b9..0bc4c9a02e1 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -98,7 +98,6 @@ extern int  sysctl_dccp_retries2;
 extern int  sysctl_dccp_feat_sequence_window;
 extern int  sysctl_dccp_feat_rx_ccid;
 extern int  sysctl_dccp_feat_tx_ccid;
-extern int  sysctl_dccp_feat_send_ack_vector;
 extern int  sysctl_dccp_tx_qlen;
 extern int  sysctl_dccp_sync_ratelimit;
 
@@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
 	const struct dccp_sock *dp = dccp_sk(sk);
 	return dp->dccps_timestamp_echo != 0 ||
 #ifdef CONFIG_IP_DCCP_ACKVEC
-	       (dccp_msk(sk)->dccpms_send_ack_vector &&
+	       (dp->dccps_hc_rx_ackvec != NULL &&
 		dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
 #endif
 	       inet_csk_ack_scheduled(sk);
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index d1e100395ef..652a1b67f72 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_backoff	= icsk->icsk_backoff;
 	info->tcpi_pmtu		= icsk->icsk_pmtu_cookie;
 
-	if (dccp_msk(sk)->dccpms_send_ack_vector)
+	if (dp->dccps_hc_rx_ackvec != NULL)
 		info->tcpi_options |= TCPI_OPT_SACK;
 
 	ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 0672b7e1763..5eb443f656c 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
 
-	if (dccp_msk(sk)->dccpms_send_ack_vector)
+	if (dp->dccps_hc_rx_ackvec != NULL)
 		dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
 					    DCCP_SKB_CB(skb)->dccpd_ack_seq);
 }
@@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 		dccp_event_ack_recv(sk, skb);
 
-	if (dccp_msk(sk)->dccpms_send_ack_vector &&
+	if (dp->dccps_hc_rx_ackvec != NULL &&
 	    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
 			    DCCP_SKB_CB(skb)->dccpd_seq,
 			    DCCP_ACKVEC_STATE_RECEIVED))
@@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
 			dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
 			    dp->dccps_options_received.dccpor_timestamp_echo));
 
-		if (dccp_msk(sk)->dccpms_send_ack_vector &&
-		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
-				    DCCP_SKB_CB(skb)->dccpd_seq,
-				    DCCP_ACKVEC_STATE_RECEIVED))
-			goto out_invalid_packet; /* FIXME: change error code */
-
 		/* Stop the REQUEST timer */
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 		WARN_ON(sk->sk_send_head == NULL);
@@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
 			dccp_event_ack_recv(sk, skb);
 
-		if (dccp_msk(sk)->dccpms_send_ack_vector &&
+		if (dp->dccps_hc_rx_ackvec != NULL &&
 		    dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
 				    DCCP_SKB_CB(skb)->dccpd_seq,
 				    DCCP_ACKVEC_STATE_RECEIVED))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 02b14812464..6821ae33dd3 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row);
 void dccp_minisock_init(struct dccp_minisock *dmsk)
 {
 	dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window;
-	dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector;
 }
 
 void dccp_time_wait(struct sock *sk, int state, int timeo)
diff --git a/net/dccp/options.c b/net/dccp/options.c
index e9d674874b4..7b1165c21f5 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -26,7 +26,6 @@
 int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW;
 int sysctl_dccp_feat_rx_ccid	      = DCCPF_INITIAL_CCID;
 int sysctl_dccp_feat_tx_ccid	      = DCCPF_INITIAL_CCID;
-int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR;
 
 u64 dccp_decode_value_var(const u8 *bf, const u8 len)
 {
@@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 		case DCCPO_ACK_VECTOR_1:
 			if (dccp_packet_without_ack(skb))   /* RFC 4340, 11.4 */
 				break;
-
-			if (dccp_msk(sk)->dccpms_send_ack_vector &&
+			if (dp->dccps_hc_rx_ackvec != NULL &&
 			    dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
 				goto out_invalid_option;
 			break;
@@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb)
 int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
 
@@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
 			if (dccp_insert_option_timestamp(sk, skb))
 				return -1;
 
-		} else if (dmsk->dccpms_send_ack_vector	&&
+		} else if (dp->dccps_hc_rx_ackvec != NULL &&
 			   dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) &&
 			   dccp_insert_option_ackvec(sk, skb)) {
 				return -1;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0941f8fe167..d5c2bacb713 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -201,7 +201,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock);
 void dccp_destroy_sock(struct sock *sk)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_minisock *dmsk = dccp_msk(sk);
 
 	/*
 	 * DCCP doesn't use sk_write_queue, just sk_send_head
@@ -219,7 +218,7 @@ void dccp_destroy_sock(struct sock *sk)
 	kfree(dp->dccps_service_list);
 	dp->dccps_service_list = NULL;
 
-	if (dmsk->dccpms_send_ack_vector) {
+	if (dp->dccps_hc_rx_ackvec != NULL) {
 		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 587c12f915c..018e210875e 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "send_ackvec",
-		.data		= &sysctl_dccp_feat_send_ack_vector,
-		.maxlen		= sizeof(sysctl_dccp_feat_send_ack_vector),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "request_retries",
 		.data		= &sysctl_dccp_request_retries,
-- 
cgit v1.2.3-70-g09d2


From 361b73d5c34f59c3fd107bb9dbe7a1fbff2c2517 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 8 Dec 2008 10:58:08 +0800
Subject: ring_buffer: fix comments

Impact: comments cleanup

fix incorrect comments for enum ring_buffer_type

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 1a350a847ed..d363467c8f1 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -28,17 +28,19 @@ struct ring_buffer_event {
  *				 size = 8 bytes
  *
  * @RINGBUF_TYPE_TIME_STAMP:	Sync time stamp with external clock
- *				 array[0] = tv_nsec
- *				 array[1] = tv_sec
+ *				 array[0]    = tv_nsec
+ *				 array[1..2] = tv_sec
  *				 size = 16 bytes
  *
  * @RINGBUF_TYPE_DATA:		Data record
  *				 If len is zero:
  *				  array[0] holds the actual length
- *				  array[1..(length+3)/4-1] holds data
+ *				  array[1..(length+3)/4] holds data
+ *				  size = 4 + 4 + length (bytes)
  *				 else
  *				  length = len << 2
- *				  array[0..(length+3)/4] holds data
+ *				  array[0..(length+3)/4-1] holds data
+ *				  size = 4 + length (bytes)
  */
 enum ring_buffer_type {
 	RINGBUF_TYPE_PADDING,
-- 
cgit v1.2.3-70-g09d2


From 0b8f1efad30bd58f89961b82dfe68b9edf8fd2ac Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 5 Dec 2008 18:58:31 -0800
Subject: sparse irq_desc[] array: core kernel and x86 changes

Impact: new feature

Problem on distro kernels: irq_desc[NR_IRQS] takes megabytes of RAM with
NR_CPUS set to large values. The goal is to be able to scale up to much
larger NR_IRQS value without impacting the (important) common case.

To solve this, we generalize irq_desc[NR_IRQS] to an (optional) array of
irq_desc pointers.

When CONFIG_SPARSE_IRQ=y is used, we use kzalloc_node to get irq_desc,
this also makes the IRQ descriptors NUMA-local (to the site that calls
request_irq()).

This gets rid of the irq_cfg[] static array on x86 as well: irq_cfg now
uses desc->chip_data for x86 to store irq_cfg.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                   |  10 ++
 arch/x86/include/asm/irq_vectors.h |   9 ++
 arch/x86/kernel/io_apic.c          | 301 +++++++++++++++++++++++--------------
 arch/x86/kernel/irq.c              |   3 +
 arch/x86/kernel/irq_32.c           |   2 +
 arch/x86/kernel/irq_64.c           |   2 +
 arch/x86/kernel/irqinit_32.c       |   1 -
 arch/x86/kernel/irqinit_64.c       |   1 -
 drivers/char/random.c              |  22 +--
 drivers/pci/intr_remapping.c       |  76 +++++++++-
 drivers/xen/events.c               |  12 +-
 fs/proc/stat.c                     |  17 ++-
 include/linux/interrupt.h          |   2 +
 include/linux/irq.h                |  54 ++++++-
 include/linux/irqnr.h              |  14 +-
 include/linux/kernel_stat.h        |  14 +-
 include/linux/random.h             |  51 +++++++
 init/main.c                        |  11 ++
 kernel/irq/autoprobe.c             |  15 ++
 kernel/irq/chip.c                  |   3 +-
 kernel/irq/handle.c                | 181 +++++++++++++++++++++-
 kernel/irq/proc.c                  |   6 +-
 kernel/irq/spurious.c              |   5 +
 23 files changed, 649 insertions(+), 163 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f..48ac688de3c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -238,6 +238,16 @@ config X86_HAS_BOOT_CPU_ID
 	def_bool y
 	depends on X86_VOYAGER
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on PCI_MSI || HT_IRQ
+	default y
+	help
+	  This enables support for sparse irq, esp for msi/msi-x. You may need
+	  if you have lots of cards supports msi-x installed.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f94..bb6b69a6b12 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -102,11 +102,20 @@
 #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15)
 
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
+
+#ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
 # else
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
+#else
+# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
+#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
+# else
+#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+# endif
+#endif
 
 #elif defined(CONFIG_X86_VOYAGER)
 
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 9043251210f..9de17f5c112 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,8 +108,33 @@ static int __init parse_noapic(char *str)
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+	struct irq_pin_list *pin;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+	return pin;
+}
+
 struct irq_cfg {
-	unsigned int irq;
 	struct irq_pin_list *irq_2_pin;
 	cpumask_t domain;
 	cpumask_t old_domain;
@@ -119,83 +144,93 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)		\
-	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
-
-static struct irq_cfg *irq_cfg(unsigned int irq)
+void __init arch_early_irq_init(void)
 {
-	return irq < nr_irqs ? irq_cfgx + irq : NULL;
-}
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	int count;
+	int i;
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
-{
-	return irq_cfg(irq);
-}
+	cfg = irq_cfgx;
+	count = ARRAY_SIZE(irq_cfgx);
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+	for (i = 0; i < count; i++) {
+		desc = irq_to_desc(i);
+		desc->chip_data = &cfg[i];
+	}
+}
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	struct irq_cfg *cfg = NULL;
+	struct irq_desc *desc;
 
-struct irq_pin_list {
-	int apic, pin;
-	struct irq_pin_list *next;
-};
+	desc = irq_to_desc(irq);
+	if (desc)
+		cfg = desc->chip_data;
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+	return cfg;
+}
 
-static void __init irq_2_pin_init(void)
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_head;
-	int i;
+	struct irq_cfg *cfg;
+	int node;
+
+	node = cpu_to_node(cpu);
 
-	for (i = 1; i < PIN_MAP_SIZE; i++)
-		pin[i-1].next = &pin[i];
+	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
 
-	irq_2_pin_ptr = &pin[0];
+	return cfg;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
-	struct irq_pin_list *pin = irq_2_pin_ptr;
+	struct irq_cfg *cfg;
 
-	if (!pin)
-		panic("can not get more irq_2_pin\n");
+	cfg = desc->chip_data;
+	if (!cfg) {
+		desc->chip_data = get_one_free_irq_cfg(cpu);
+		if (!desc->chip_data) {
+			printk(KERN_ERR "can not alloc irq_cfg\n");
+			BUG_ON(1);
+		}
+	}
+}
 
-	irq_2_pin_ptr = pin->next;
-	pin->next = NULL;
-	return pin;
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
 }
 
+#endif
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -397,16 +432,19 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(unsigned int irq, int cpu, int apic, int pin)
 {
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	struct irq_cfg *cfg = irq_cfg(irq);
 
-	/* first time to refer irq_cfg, so with new */
-	cfg = irq_cfg_alloc(irq);
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin();
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+					apic, pin);
+			return;
+		}
 		cfg->irq_2_pin = entry;
 		entry->apic = apic;
 		entry->pin = pin;
@@ -421,7 +459,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin();
+	entry->next = get_one_free_irq_2_pin(cpu);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
@@ -430,7 +468,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq(unsigned int irq, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
@@ -451,7 +489,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq(irq, newapic, newpin);
+		add_pin_to_irq_cpu(irq, cpu, newapic, newpin);
 }
 
 static inline void io_apic_modify_irq(unsigned int irq,
@@ -1162,9 +1200,13 @@ void __setup_vector_irq(int cpu)
 	/* This function must be called with vector_lock held */
 	int irq, vector;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
 	/* Mark the inuse vectors */
-	for_each_irq_cfg(irq, cfg) {
+	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
 		vector = cfg->vector;
@@ -1356,6 +1398,8 @@ static void __init setup_IO_APIC_irqs(void)
 {
 	int apic, pin, idx, irq;
 	int notcon = 0;
+	struct irq_desc *desc;
+	int cpu = boot_cpu_id;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1387,7 +1431,12 @@ static void __init setup_IO_APIC_irqs(void)
 			if (multi_timer_check(apic, irq))
 				continue;
 #endif
-			add_pin_to_irq(irq, apic, pin);
+			desc = irq_to_desc_alloc_cpu(irq, cpu);
+			if (!desc) {
+				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+				continue;
+			}
+			add_pin_to_irq_cpu(irq, cpu, apic, pin);
 
 			setup_IO_APIC_irq(apic, pin, irq,
 					irq_trigger(idx), irq_polarity(idx));
@@ -1448,6 +1497,7 @@ __apicdebuginit(void) print_IO_APIC(void)
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
 	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1587,13 @@ __apicdebuginit(void) print_IO_APIC(void)
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for_each_irq_cfg(irq, cfg) {
-		struct irq_pin_list *entry = cfg->irq_2_pin;
+	for_each_irq_desc(irq, desc) {
+		struct irq_pin_list *entry;
+
+		if (!desc)
+			continue;
+		cfg = desc->chip_data;
+		entry = cfg->irq_2_pin;
 		if (!entry)
 			continue;
 		printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,6 +2077,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 {
 	int was_pending = 0;
 	unsigned long flags;
+	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	if (irq < 16) {
@@ -2029,6 +2085,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
 	}
+	cfg = irq_cfg(irq);
 	__unmask_IO_APIC_irq(irq);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
@@ -2178,6 +2235,9 @@ static void ir_irq_migration(struct work_struct *work)
 	struct irq_desc *desc;
 
 	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+
 		if (desc->status & IRQ_MOVE_PENDING) {
 			unsigned long flags;
 
@@ -2229,6 +2289,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		struct irq_cfg *cfg;
 		irq = __get_cpu_var(vector_irq)[vector];
 
+		if (irq == -1)
+			continue;
+
 		desc = irq_to_desc(irq);
 		if (!desc)
 			continue;
@@ -2430,8 +2493,12 @@ static inline void init_IO_APIC_traps(void)
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for_each_irq_cfg(irq, cfg) {
-		if (IO_APIC_IRQ(irq) && !cfg->vector) {
+	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+
+		cfg = desc->chip_data;
+		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
@@ -2439,11 +2506,9 @@ static inline void init_IO_APIC_traps(void)
 			 */
 			if (irq < 16)
 				make_8259A_irq(irq);
-			else {
-				desc = irq_to_desc(irq);
+			else
 				/* Strange. Oh, well.. */
 				desc->chip = &no_irq_chip;
-			}
 		}
 	}
 }
@@ -2654,7 +2719,7 @@ static inline void __init check_timer(void)
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
+			add_pin_to_irq_cpu(0, boot_cpu_id, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
 		unmask_IO_APIC_irq(0);
@@ -2683,7 +2748,7 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq(0, boot_cpu_id, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
@@ -2902,21 +2967,25 @@ unsigned int create_irq_nr(unsigned int irq_want)
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
-	struct irq_cfg *cfg_new;
-
-	irq_want = nr_irqs - 1;
+	struct irq_cfg *cfg_new = NULL;
+	int cpu = boot_cpu_id;
+	struct irq_desc *desc_new = NULL;
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		cfg_new = irq_cfg(new);
-		if (cfg_new && cfg_new->vector != 0)
+
+		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		if (!desc_new) {
+			printk(KERN_INFO "can not get irq_desc for %d\n", new);
+			continue;
+		}
+		cfg_new = desc_new->chip_data;
+
+		if (cfg_new->vector != 0)
 			continue;
-		/* check if need to create one */
-		if (!cfg_new)
-			cfg_new = irq_cfg_alloc(new);
 		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
@@ -2925,6 +2994,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 	if (irq > 0) {
 		dynamic_irq_init(irq);
+		/* restore it, in case dynamic_irq_init clear it */
+		if (desc_new)
+			desc_new->chip_data = cfg_new;
 	}
 	return irq;
 }
@@ -2944,8 +3016,16 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
 
+	/* store it, in case dynamic_irq_cleanup clear it */
+	desc = irq_to_desc(irq);
+	cfg = desc->chip_data;
 	dynamic_irq_cleanup(irq);
+	/* connect back irq_cfg */
+	if (desc)
+		desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
 	free_irte(irq);
@@ -3195,26 +3275,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
 	return 0;
 }
 
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
-	unsigned int irq;
-
-	irq = dev->bus->number;
-	irq <<= 8;
-	irq |= dev->devfn;
-	irq <<= 12;
-
-	return irq;
-}
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
 {
 	unsigned int irq;
 	int ret;
 	unsigned int irq_want;
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
-
+	irq_want = nr_irqs - 1;
 	irq = create_irq_nr(irq_want);
 	if (irq == 0)
 		return -1;
@@ -3228,7 +3295,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
 		goto error;
 no_ir:
 #endif
-	ret = setup_msi_irq(dev, desc, irq);
+	ret = setup_msi_irq(dev, msidesc, irq);
 	if (ret < 0) {
 		destroy_irq(irq);
 		return ret;
@@ -3246,7 +3313,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
 	unsigned int irq;
 	int ret, sub_handle;
-	struct msi_desc *desc;
+	struct msi_desc *msidesc;
 	unsigned int irq_want;
 
 #ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3321,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	int index = 0;
 #endif
 
-	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	irq_want = nr_irqs - 1;
 	sub_handle = 0;
-	list_for_each_entry(desc, &dev->msi_list, list) {
-		irq = create_irq_nr(irq_want--);
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
+		irq = create_irq_nr(irq_want);
+		irq_want--;
 		if (irq == 0)
 			return -1;
 #ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3357,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 		}
 no_ir:
 #endif
-		ret = setup_msi_irq(dev, desc, irq);
+		ret = setup_msi_irq(dev, msidesc, irq);
 		if (ret < 0)
 			goto error;
 		sub_handle++;
@@ -3707,17 +3775,29 @@ int __init io_apic_get_version(int ioapic)
 
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int cpu = boot_cpu_id;
+
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
 			ioapic);
 		return -EINVAL;
 	}
 
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	if (!desc) {
+		printk(KERN_INFO "can not get irq_desc %d\n", irq);
+		return 0;
+	}
+
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
+	if (irq >= 16) {
+		cfg = desc->chip_data;
+		add_pin_to_irq_cpu(irq, cpu, ioapic, pin);
+	}
 
 	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
 
@@ -3773,7 +3853,8 @@ void __init setup_ioapic_dest(void)
 			 * when you have too many devices, because at that time only boot
 			 * cpu is online.
 			 */
-			cfg = irq_cfg(irq);
+			desc = irq_to_desc(irq);
+			cfg = desc->chip_data;
 			if (!cfg->vector) {
 				setup_IO_APIC_irq(ioapic, pin, irq,
 						  irq_trigger(irq_entry),
@@ -3785,7 +3866,6 @@ void __init setup_ioapic_dest(void)
 			/*
 			 * Honour affinities which have been set in early boot
 			 */
-			desc = irq_to_desc(irq);
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
 				mask = desc->affinity;
@@ -3846,7 +3926,6 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
-	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f64..3f1d9d18df6 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)
 	}
 
 	desc = irq_to_desc(i);
+	if (!desc)
+		return 0;
+
 	spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
 	any_count = kstat_irqs(i);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de..119fc9c8ff7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -242,6 +242,8 @@ void fixup_irqs(cpumask_t map)
 	for_each_irq_desc(irq, desc) {
 		cpumask_t mask;
 
+		if (!desc)
+			continue;
 		if (irq == 2)
 			continue;
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a..900009c7059 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -94,6 +94,8 @@ void fixup_irqs(cpumask_t map)
 		int break_affinity = 0;
 		int set_affinity = 1;
 
+		if (!desc)
+			continue;
 		if (irq == 2)
 			continue;
 
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e8..5a5651b7f9e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,7 +69,6 @@ void __init init_ISA_irqs (void)
 	 * 16 old-style INTA-cycle interrupts:
 	 */
 	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff023539128..cd9f42d028d 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -143,7 +143,6 @@ void __init init_ISA_irqs(void)
 	init_8259A(0);
 
 	for (i = 0; i < 16; i++) {
-		/* first time call this irq_desc */
 		struct irq_desc *desc = irq_to_desc(i);
 
 		desc->status = IRQ_DISABLED;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 675076f5fca..d26891bfcd4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,23 +558,9 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
-static struct timer_rand_state *irq_timer_state[NR_IRQS];
-
-static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
-{
-	if (irq >= nr_irqs)
-		return NULL;
-
-	return irq_timer_state[irq];
-}
-
-static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
-{
-	if (irq >= nr_irqs)
-		return;
-
-	irq_timer_state[irq] = state;
-}
+#ifndef CONFIG_SPARSE_IRQ
+struct timer_rand_state *irq_timer_state[NR_IRQS];
+#endif
 
 static struct timer_rand_state input_timer_state;
 
@@ -933,8 +919,10 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
 	if (irq >= nr_irqs)
 		return;
+#endif
 
 	state = get_timer_rand_state(irq);
 
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 2de5a3238c9..c9958ec5e25 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -19,17 +19,75 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+	struct irq_2_iommu *iommu;
+	int node;
+
+	node = cpu_to_node(cpu);
+
+	iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+	return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (WARN_ON_ONCE(!desc))
+		return NULL;
+
+	return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	struct irq_2_iommu *irq_iommu;
+
+	/*
+	 * alloc irq desc if not allocated already.
+	 */
+	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	if (!desc) {
+		printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+		return NULL;
+	}
+
+	irq_iommu = desc->irq_2_iommu;
+
+	if (!irq_iommu)
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+	return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+	return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+	if (irq < nr_irqs)
+		return &irq_2_iommuX[irq];
+
+	return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
 	return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +144,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 	if (!count)
 		return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
 	/* protect irq_2_iommu_alloc later */
 	if (irq >= nr_irqs)
 		return -1;
+#endif
 
 	/*
 	 * start the IRTE search from index 0.
@@ -130,6 +190,12 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 		table->base[i].present = 1;
 
 	irq_iommu = irq_2_iommu_alloc(irq);
+	if (!irq_iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		printk(KERN_ERR "can't allocate irq_2_iommu\n");
+		return -1;
+	}
+
 	irq_iommu->iommu = iommu;
 	irq_iommu->irte_index =  index;
 	irq_iommu->sub_handle = 0;
@@ -177,6 +243,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 
 	irq_iommu = irq_2_iommu_alloc(irq);
 
+	if (!irq_iommu) {
+		spin_unlock(&irq_2_ir_lock);
+		printk(KERN_ERR "can't allocate irq_2_iommu\n");
+		return -1;
+	}
+
 	irq_iommu->iommu = iommu;
 	irq_iommu->irte_index = index;
 	irq_iommu->sub_handle = subhandle;
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 1e3b934a4cf..2924faa7f6c 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -141,8 +141,12 @@ static void init_evtchn_cpu_bindings(void)
 	int i;
 
 	/* By default all event channels notify CPU#0. */
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+
 		desc->affinity = cpumask_of_cpu(0);
+	}
 #endif
 
 	memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +235,7 @@ static int find_unbound_irq(void)
 	int irq;
 
 	/* Only allocate from dynirq range */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		if (irq_bindcount[irq] == 0)
 			break;
 
@@ -792,7 +796,7 @@ void xen_irq_resume(void)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
-	for_each_irq_nr(irq)
+	for (irq = 0; irq < nr_irqs; irq++)
 		irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +828,7 @@ void __init xen_init_IRQ(void)
 		mask_evtchn(i);
 
 	/* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-	for_each_irq_nr(i)
+	for (i = 0; i < nr_irqs; i++)
 		irq_bindcount[i] = 0;
 
 	irq_ctx_init(smp_processor_id());
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679..a13431ab7c6 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,6 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum = 0;
 	struct timespec boottime;
 	unsigned int per_irq_sum;
+	struct irq_desc *desc;
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
@@ -44,10 +45,11 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_desc(j, desc) {
+			if (!desc)
+				continue;
 			sum += kstat_irqs_cpu(j, i);
-
+		}
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -90,11 +92,14 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for_each_irq_nr(j) {
+	for (j = 0; j < NR_IRQS; j++) {
+		desc = irq_to_desc(j);
 		per_irq_sum = 0;
 
-		for_each_possible_cpu(i)
-			per_irq_sum += kstat_irqs_cpu(j, i);
+		if (desc) {
+			for_each_possible_cpu(i)
+				per_irq_sum += kstat_irqs_cpu(j, i);
+		}
 
 		seq_printf(p, " %u", per_irq_sum);
 	}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f58a0cf8929..79e915e7e8a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -18,6 +18,8 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
+extern int nr_irqs;
+
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3dddfa703eb..63b00439d4d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -129,6 +129,8 @@ struct irq_chip {
 	const char	*typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  * @irq:		interrupt number for this descriptor
@@ -154,6 +156,13 @@ struct irq_chip {
  */
 struct irq_desc {
 	unsigned int		irq;
+#ifdef CONFIG_SPARSE_IRQ
+	struct timer_rand_state *timer_rand_state;
+	unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+	struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
 	struct msi_desc		*msi_desc;
@@ -181,14 +190,52 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
+extern void early_irq_init(void);
+extern void arch_early_irq_init(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+					struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
+
+#ifndef CONFIG_SPARSE_IRQ
 
 extern struct irq_desc irq_desc[NR_IRQS];
 
 static inline struct irq_desc *irq_to_desc(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_desc + irq : NULL;
+	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
 }
 
+#ifdef CONFIG_GENERIC_HARDIRQS
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
+#endif
+
+#else
+
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
+
+#define kstat_irqs_this_cpu(DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+	((DESC)->kstat_irqs[smp_processor_id()]++)
+
+#endif
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */
@@ -380,6 +427,11 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 #define get_irq_data(irq)	(irq_to_desc(irq)->handler_data)
 #define get_irq_msi(irq)	(irq_to_desc(irq)->msi_desc)
 
+#define get_irq_desc_chip(desc)		((desc)->chip)
+#define get_irq_desc_chip_data(desc)	((desc)->chip_data)
+#define get_irq_desc_data(desc)		((desc)->handler_data)
+#define get_irq_desc_msi(desc)		((desc)->msi_desc)
+
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 452c280c811..7a299e989f8 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -7,18 +7,10 @@
 
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
-#else
-extern int nr_irqs;
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)				\
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);	\
-	     irq >= 0; irq--, desc--)
+static inline early_sparse_irq_init(void)
+{
+}
 #endif
 
-#define for_each_irq_nr(irq)			\
-	for (irq = 0; irq < nr_irqs; irq++)
-
 #endif
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145caeee0..4ee4b3d2316 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-	unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+	(kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
 {
 	kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
diff --git a/include/linux/random.h b/include/linux/random.h
index 36f125c0c60..ad9daa2374d 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -44,6 +44,57 @@ struct rand_pool_info {
 
 extern void rand_initialize_irq(int irq);
 
+struct timer_rand_state;
+#ifndef CONFIG_SPARSE_IRQ
+
+extern struct timer_rand_state *irq_timer_state[];
+
+extern int nr_irqs;
+static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	if (irq >= nr_irqs)
+		return NULL;
+
+	return irq_timer_state[irq];
+}
+
+static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	if (irq >= nr_irqs)
+		return;
+
+	irq_timer_state[irq] = state;
+}
+
+#else
+
+#include <linux/irq.h>
+static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return NULL;
+
+	return desc->timer_rand_state;
+}
+
+static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	desc->timer_rand_state = state;
+}
+#endif
+
+
 extern void add_input_randomness(unsigned int type, unsigned int code,
 				 unsigned int value);
 extern void add_interrupt_randomness(int irq);
diff --git a/init/main.c b/init/main.c
index 7e117a231af..c1f999a3cf3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,6 +539,15 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
+void __init __weak arch_early_irq_init(void)
+{
+}
+
+void __init __weak early_irq_init(void)
+{
+	arch_early_irq_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -603,6 +612,8 @@ asmlinkage void __init start_kernel(void)
 	sort_main_extable();
 	trap_init();
 	rcu_init();
+	/* init some links before init_ISA_irqs() */
+	early_irq_init();
 	init_IRQ();
 	pidhash_init();
 	init_timers();
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8c..650ce4102a6 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,6 +40,9 @@ unsigned long probe_irq_on(void)
 	 * flush such a longstanding irq before considering it as spurious.
 	 */
 	for_each_irq_desc_reverse(i, desc) {
+		if (!desc)
+			continue;
+
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			/*
@@ -68,6 +71,9 @@ unsigned long probe_irq_on(void)
 	 * happened in the previous stage, it may have masked itself)
 	 */
 	for_each_irq_desc_reverse(i, desc) {
+		if (!desc)
+			continue;
+
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -86,6 +92,9 @@ unsigned long probe_irq_on(void)
 	 * Now filter out any obviously spurious interrupts
 	 */
 	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val)
 	int i;
 
 	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val)
 	unsigned int status;
 
 	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bf..8e4fce4a1b1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,9 +24,10 @@
  */
 void dynamic_irq_init(unsigned int irq)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_desc *desc;
 	unsigned long flags;
 
+	desc = irq_to_desc(irq);
 	if (!desc) {
 		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c815b42d0f5..96ca203eb51 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+	.irq	    = -1,
+	.status	    = IRQ_DISABLED,
+	.chip	    = &no_irq_chip,
+	.handle_irq = handle_bad_irq,
+	.depth      = 1,
+	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+	.affinity   = CPU_MASK_ALL
+#endif
+};
+
+static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+	unsigned long bytes;
+	char *ptr;
+	int node;
+
+	/* Compute how many bytes we need per irq and allocate them */
+	bytes = nr * sizeof(unsigned int);
+
+	node = cpu_to_node(cpu);
+	ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+	if (ptr)
+		desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+	desc->irq = irq;
+#ifdef CONFIG_SMP
+	desc->cpu = cpu;
+#endif
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	if (!desc->kstat_irqs) {
+		printk(KERN_ERR "can not alloc kstat_irqs\n");
+		BUG_ON(1);
+	}
+	arch_init_chip_data(desc, cpu);
+}
+
+/*
+ * Protect the sparse_irqs:
+ */
+static DEFINE_SPINLOCK(sparse_irq_lock);
+
+struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+
+static struct irq_desc irq_desc_legacy[16] __cacheline_aligned_in_smp = {
+	[0 ... 15] = {
+		.irq	    = -1,
+		.status	    = IRQ_DISABLED,
+		.chip	    = &no_irq_chip,
+		.handle_irq = handle_bad_irq,
+		.depth	    = 1,
+		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+		.affinity   = CPU_MASK_ALL
+#endif
+	}
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[16][NR_CPUS];
+
+void __init early_irq_init(void)
+{
+	struct irq_desc *desc;
+	int legacy_count;
+	int i;
+
+	desc = irq_desc_legacy;
+	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+	for (i = 0; i < legacy_count; i++) {
+		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+
+		irq_desc_ptrs[i] = desc + i;
+	}
+
+	for (i = legacy_count; i < NR_IRQS; i++)
+		irq_desc_ptrs[i] = NULL;
+
+	arch_early_irq_init();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc;
+	unsigned long flags;
+	int node;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
+				irq, NR_IRQS);
+		WARN_ON(1);
+		return NULL;
+	}
+
+	desc = irq_desc_ptrs[irq];
+	if (desc)
+		return desc;
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/* We have to check it to avoid races with another CPU */
+	desc = irq_desc_ptrs[irq];
+	if (desc)
+		goto out_unlock;
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
+		 irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not alloc irq_desc\n");
+		BUG_ON(1);
+	}
+	init_one_irq_desc(irq, desc, cpu);
+
+	irq_desc_ptrs[irq] = desc;
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
@@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
@@ -261,17 +419,28 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
-	for_each_irq_desc(i, desc)
+	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	}
+#endif
+}
+#endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	return desc->kstat_irqs[cpu];
 }
 #endif
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d257e7d6a8a..f6b3440f05b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -243,7 +243,11 @@ void init_irq_proc(void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for_each_irq_desc(irq, desc)
+	for_each_irq_desc(irq, desc) {
+		if (!desc)
+			continue;
+
 		register_irq_proc(irq, desc);
+	}
 }
 
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56..3738107531f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,6 +91,9 @@ static int misrouted_irq(int irq)
 	int i, ok = 0;
 
 	for_each_irq_desc(i, desc) {
+		if (!desc)
+			continue;
+
 		if (!i)
 			 continue;
 
@@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy)
 	for_each_irq_desc(i, desc) {
 		unsigned int status;
 
+		if (!desc)
+			continue;
 		if (!i)
 			 continue;
 
-- 
cgit v1.2.3-70-g09d2


From 3145e941fcfe2548fa2270afb1a05bab3a6bc418 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 5 Dec 2008 18:58:34 -0800
Subject: x86, MSI: pass irq_cfg and irq_desc

Impact: simplify code

Pass irq_desc and cfg around, instead of raw IRQ numbers - this way
we dont have to look it up again and again.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic.c | 318 ++++++++++++++++++++++++++--------------------
 drivers/pci/msi.c         |  55 +++++---
 include/linux/msi.h       |   3 +
 3 files changed, 222 insertions(+), 154 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 0dcde74abd1..a1a2e070f31 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -231,6 +231,10 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 
 #endif
 
+static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+}
+
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -272,11 +276,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
 	writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	entry = cfg->irq_2_pin;
@@ -358,13 +361,12 @@ static void ioapic_mask_entry(int apic, int pin)
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
 	int apic, pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
 
-	cfg = irq_cfg(irq);
 	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
@@ -394,24 +396,27 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
 	}
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
@@ -419,12 +424,20 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
-	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__target_IO_APIC_irq(irq, dest, cfg->vector);
+	__target_IO_APIC_irq(irq, dest, cfg);
 	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
+
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	set_ioapic_affinity_irq_desc(desc, mask);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -432,10 +445,9 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq_cpu(unsigned int irq, int cpu, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
 	struct irq_pin_list *entry;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
 	entry = cfg->irq_2_pin;
 	if (!entry) {
@@ -468,11 +480,10 @@ static void add_pin_to_irq_cpu(unsigned int irq, int cpu, int apic, int pin)
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq, int cpu,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
 	struct irq_pin_list *entry = cfg->irq_2_pin;
 	int replaced = 0;
 
@@ -489,18 +500,16 @@ static void __init replace_pin_at_irq(unsigned int irq, int cpu,
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq_cpu(irq, cpu, newapic, newpin);
+		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
 				int mask_and, int mask_or,
 				void (*final)(struct irq_pin_list *entry))
 {
 	int pin;
-	struct irq_cfg *cfg;
 	struct irq_pin_list *entry;
 
-	cfg = irq_cfg(irq);
 	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
 		unsigned int reg;
 		pin = entry->pin;
@@ -513,9 +522,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
 	}
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -530,47 +539,64 @@ void io_apic_sync(struct irq_pin_list *entry)
 	readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
 			IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
 			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned long flags;
 
+	BUG_ON(!cfg);
+
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
+	__mask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	unmask_IO_APIC_irq_desc(desc);
+}
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -1072,7 +1098,7 @@ void unlock_vector_lock(void)
 	spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1088,16 +1114,13 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
-	struct irq_cfg *cfg;
 
-	cfg = irq_cfg(irq);
+	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+		return -EBUSY;
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
-		return -EBUSY;
-
 	old_vector = cfg->vector;
 	if (old_vector) {
 		cpumask_t tmp;
@@ -1151,24 +1174,22 @@ next:
 	return -ENOSPC;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 {
 	int err;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, mask);
+	err = __assign_irq_vector(irq, cfg, mask);
 	spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-	struct irq_cfg *cfg;
 	cpumask_t mask;
 	int cpu, vector;
 
-	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
@@ -1257,11 +1278,8 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
 
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 
 	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 	    trigger == IOAPIC_LEVEL)
@@ -1353,7 +1371,7 @@ static int setup_ioapic_entry(int apic, int irq,
 	return 0;
 }
 
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
 			      int trigger, int polarity)
 {
 	struct irq_cfg *cfg;
@@ -1363,10 +1381,10 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 	if (!IO_APIC_IRQ(irq))
 		return;
 
-	cfg = irq_cfg(irq);
+	cfg = desc->chip_data;
 
 	mask = TARGET_CPUS;
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
 	cpus_and(mask, cfg->domain, mask);
@@ -1383,11 +1401,11 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 			       cfg->vector)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
 		       mp_ioapics[apic].mp_apicid, pin);
-		__clear_irq_vector(irq);
+		__clear_irq_vector(irq, cfg);
 		return;
 	}
 
-	ioapic_register_intr(irq, trigger);
+	ioapic_register_intr(irq, desc, trigger);
 	if (irq < NR_IRQS_LEGACY)
 		disable_8259A_irq(irq);
 
@@ -1399,6 +1417,7 @@ static void __init setup_IO_APIC_irqs(void)
 	int apic, pin, idx, irq;
 	int notcon = 0;
 	struct irq_desc *desc;
+	struct irq_cfg *cfg;
 	int cpu = boot_cpu_id;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1436,9 +1455,10 @@ static void __init setup_IO_APIC_irqs(void)
 				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 				continue;
 			}
-			add_pin_to_irq_cpu(irq, cpu, apic, pin);
+			cfg = desc->chip_data;
+			add_pin_to_irq_cpu(cfg, cpu, apic, pin);
 
-			setup_IO_APIC_irq(apic, pin, irq,
+			setup_IO_APIC_irq(apic, pin, irq, desc,
 					irq_trigger(idx), irq_polarity(idx));
 		}
 	}
@@ -2086,7 +2106,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 			was_pending = 1;
 	}
 	cfg = irq_cfg(irq);
-	__unmask_IO_APIC_irq(irq);
+	__unmask_IO_APIC_irq(cfg);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
@@ -2149,35 +2169,37 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
 	int modify_ioapic_rte;
 	unsigned int dest;
 	unsigned long flags;
+	unsigned int irq;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	irq = desc->irq;
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	desc = irq_to_desc(irq);
 	modify_ioapic_rte = desc->status & IRQ_LEVEL;
 	if (modify_ioapic_rte) {
 		spin_lock_irqsave(&ioapic_lock, flags);
-		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		__target_IO_APIC_irq(irq, dest, cfg);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -2199,14 +2221,14 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
 	desc->affinity = mask;
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 {
 	int ret = -1;
-	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
 
-	mask_IO_APIC_irq(irq);
+	mask_IO_APIC_irq_desc(desc);
 
-	if (io_apic_level_ack_pending(irq)) {
+	if (io_apic_level_ack_pending(cfg)) {
 		/*
 		 * Interrupt in progress. Migrating irq now will change the
 		 * vector information in the IO-APIC RTE and that will confuse
@@ -2218,14 +2240,15 @@ static int migrate_irq_remapped_level(int irq)
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq(irq, desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
 	cpus_clear(desc->pending_mask);
 
 unmask:
-	unmask_IO_APIC_irq(irq);
+	unmask_IO_APIC_irq_desc(desc);
+
 	return ret;
 }
 
@@ -2258,18 +2281,22 @@ static void ir_irq_migration(struct work_struct *work)
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
 {
-	struct irq_desc *desc = irq_to_desc(irq);
-
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
 		desc->pending_mask = mask;
-		migrate_irq_remapped_level(irq);
+		migrate_irq_remapped_level_desc(desc);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif
 
@@ -2313,9 +2340,10 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_desc *desc = *descp;
+	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
 	if (likely(!cfg->move_in_progress))
@@ -2333,8 +2361,9 @@ static void irq_complete_move(unsigned int irq)
 	}
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
 static void ack_x2apic_level(unsigned int irq)
 {
@@ -2345,11 +2374,14 @@ static void ack_x2apic_edge(unsigned int irq)
 {
 	ack_x2APIC_irq();
 }
+
 #endif
 
 static void ack_apic_edge(unsigned int irq)
 {
-	irq_complete_move(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	irq_complete_move(&desc);
 	move_native_irq(irq);
 	ack_APIC_irq();
 }
@@ -2358,18 +2390,21 @@ atomic_t irq_mis_count;
 
 static void ack_apic_level(unsigned int irq)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
+
 #ifdef CONFIG_X86_32
 	unsigned long v;
 	int i;
 #endif
+	struct irq_cfg *cfg;
 	int do_unmask_irq = 0;
 
-	irq_complete_move(irq);
+	irq_complete_move(&desc);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+	if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
-		mask_IO_APIC_irq(irq);
+		mask_IO_APIC_irq_desc(desc);
 	}
 #endif
 
@@ -2393,7 +2428,8 @@ static void ack_apic_level(unsigned int irq)
 	* operation to prevent an edge-triggered interrupt escaping meanwhile.
 	* The idea is from Manfred Spraul.  --macro
 	*/
-	i = irq_cfg(irq)->vector;
+	cfg = desc->chip_data;
+	i = cfg->vector;
 
 	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2432,17 +2468,18 @@ static void ack_apic_level(unsigned int irq)
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(irq))
+		cfg = desc->chip_data;
+		if (!io_apic_level_ack_pending(cfg))
 			move_masked_irq(irq);
-		unmask_IO_APIC_irq(irq);
+		unmask_IO_APIC_irq_desc(desc);
 	}
 
 #ifdef CONFIG_X86_32
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
 		spin_unlock(&ioapic_lock);
 	}
 #endif
@@ -2533,7 +2570,7 @@ static void unmask_lapic_irq(unsigned int irq)
 	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
 {
 	ack_APIC_irq();
 }
@@ -2545,11 +2582,8 @@ static struct irq_chip lapic_chip __read_mostly = {
 	.ack		= ack_lapic_irq,
 };
 
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
 {
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
 	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
@@ -2653,7 +2687,9 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_desc *desc = irq_to_desc(0);
+	struct irq_cfg *cfg = desc->chip_data;
+	int cpu = boot_cpu_id;
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	unsigned int ver;
@@ -2668,7 +2704,7 @@ static inline void __init check_timer(void)
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	assign_irq_vector(0, TARGET_CPUS);
+	assign_irq_vector(0, cfg, TARGET_CPUS);
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2719,10 +2755,10 @@ static inline void __init check_timer(void)
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq_cpu(0, boot_cpu_id, apic1, pin1);
+			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(desc);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
@@ -2748,9 +2784,9 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq(0, boot_cpu_id, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-		unmask_IO_APIC_irq(0);
+		unmask_IO_APIC_irq_desc(desc);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2782,7 +2818,7 @@ static inline void __init check_timer(void)
 	apic_printk(APIC_QUIET, KERN_INFO
 		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
-	lapic_register_intr(0);
+	lapic_register_intr(0, desc);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
@@ -2986,7 +3022,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 		if (cfg_new->vector != 0)
 			continue;
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
@@ -3034,7 +3070,7 @@ void destroy_irq(unsigned int irq)
 	free_irte(irq);
 #endif
 	spin_lock_irqsave(&vector_lock, flags);
-	__clear_irq_vector(irq);
+	__clear_irq_vector(irq, cfg);
 	spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -3049,12 +3085,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 	unsigned dest;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (err)
 		return err;
 
-	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, tmp);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3112,35 +3148,35 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 #ifdef CONFIG_SMP
 static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
-	read_msi_msg(irq, &msg);
+	read_msi_msg_desc(desc, &msg);
 
 	msg.data &= ~MSI_DATA_VECTOR_MASK;
 	msg.data |= MSI_DATA_VECTOR(cfg->vector);
 	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
-	write_msi_msg(irq, &msg);
-	desc = irq_to_desc(irq);
+	write_msi_msg_desc(desc, &msg);
 	desc->affinity = mask;
 }
-
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
@@ -3148,11 +3184,11 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
  */
 static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp, cleanup_mask;
 	struct irte irte;
-	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -3161,10 +3197,12 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3188,9 +3226,9 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 		cfg->move_in_progress = 0;
 	}
 
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
 #endif
 #endif /* CONFIG_SMP */
 
@@ -3249,7 +3287,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 }
 #endif
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
 	int ret;
 	struct msi_msg msg;
@@ -3258,7 +3296,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
 	if (ret < 0)
 		return ret;
 
-	set_irq_msi(irq, desc);
+	set_irq_msi(irq, msidesc);
 	write_msi_msg(irq, &msg);
 
 #ifdef CONFIG_INTR_REMAP
@@ -3381,20 +3419,22 @@ void arch_teardown_msi_irq(unsigned int irq)
 #ifdef CONFIG_SMP
 static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3406,9 +3446,9 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip dmar_msi_type = {
@@ -3442,8 +3482,8 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_SMP
 static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
-	struct irq_desc *desc;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
@@ -3452,10 +3492,12 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -3467,9 +3509,9 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip hpet_msi_type = {
@@ -3524,26 +3566,28 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 
 static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 {
+	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp;
-	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	cfg = irq_cfg(irq);
+	set_extra_move_desc(desc, mask);
+
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	desc = irq_to_desc(irq);
 	desc->affinity = mask;
 }
+
 #endif
 
 static struct irq_chip ht_irq_chip = {
@@ -3563,13 +3607,13 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 	int err;
 	cpumask_t tmp;
 
+	cfg = irq_cfg(irq);
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	err = assign_irq_vector(irq, cfg, tmp);
 	if (!err) {
 		struct ht_irq_msg msg;
 		unsigned dest;
 
-		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -3615,7 +3659,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	unsigned long flags;
 	int err;
 
-	err = assign_irq_vector(irq, *eligible_cpu);
+	cfg = irq_cfg(irq);
+
+	err = assign_irq_vector(irq, cfg, *eligible_cpu);
 	if (err != 0)
 		return err;
 
@@ -3624,8 +3670,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 				      irq_name);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	cfg = irq_cfg(irq);
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3806,10 +3850,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 	 */
 	if (irq >= NR_IRQS_LEGACY) {
 		cfg = desc->chip_data;
-		add_pin_to_irq_cpu(irq, cpu, ioapic, pin);
+		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
 	}
 
-	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
 
 	return 0;
 }
@@ -3866,7 +3910,7 @@ void __init setup_ioapic_dest(void)
 			desc = irq_to_desc(irq);
 			cfg = desc->chip_data;
 			if (!cfg->vector) {
-				setup_IO_APIC_irq(ioapic, pin, irq,
+				setup_IO_APIC_irq(ioapic, pin, irq, desc,
 						  irq_trigger(irq_entry),
 						  irq_polarity(irq_entry));
 				continue;
@@ -3884,10 +3928,10 @@ void __init setup_ioapic_dest(void)
 
 #ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq(irq, mask);
+				set_ir_ioapic_affinity_irq_desc(desc, mask);
 			else
 #endif
-				set_ioapic_affinity_irq(irq, mask);
+				set_ioapic_affinity_irq_desc(desc, mask);
 		}
 
 	}
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 74801f7df9c..11a51f8ed3b 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
 	}
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned int irq)
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
 	struct msi_desc *entry;
 
-	entry = get_irq_msi(irq);
+	entry = get_irq_desc_msi(desc);
 	BUG_ON(!entry || !entry->dev);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
@@ -172,9 +172,9 @@ static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
 	return 1;
 }
 
-void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
-	struct msi_desc *entry = get_irq_msi(irq);
+	struct msi_desc *entry = get_irq_desc_msi(desc);
 	switch(entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
 	{
@@ -211,9 +211,16 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
 	}
 }
 
-void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
-	struct msi_desc *entry = get_irq_msi(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	read_msi_msg_desc(desc, msg);
+}
+
+void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+{
+	struct msi_desc *entry = get_irq_desc_msi(desc);
 	switch (entry->msi_attrib.type) {
 	case PCI_CAP_ID_MSI:
 	{
@@ -252,21 +259,31 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 	entry->msg = *msg;
 }
 
+void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	write_msi_msg_desc(desc, msg);
+}
+
 void mask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 1);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	msi_set_mask_bits(desc, 1, 1);
+	msix_flush_writes(desc);
 }
 
 void unmask_msi_irq(unsigned int irq)
 {
-	msi_set_mask_bits(irq, 1, 0);
-	msix_flush_writes(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	msi_set_mask_bits(desc, 1, 0);
+	msix_flush_writes(desc);
 }
 
 static int msi_free_irqs(struct pci_dev* dev);
 
-
 static struct msi_desc* alloc_msi_entry(void)
 {
 	struct msi_desc *entry;
@@ -303,9 +320,11 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
 	pci_intx_for_msi(dev, 0);
 	msi_set_enable(dev, 0);
 	write_msi_msg(dev->irq, &entry->msg);
-	if (entry->msi_attrib.maskbit)
-		msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+	if (entry->msi_attrib.maskbit) {
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
 				  entry->msi_attrib.masked);
+	}
 
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
 	control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +346,9 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
 	msix_set_enable(dev, 0);
 
 	list_for_each_entry(entry, &dev->msi_list, list) {
+		struct irq_desc *desc = irq_to_desc(entry->irq);
 		write_msi_msg(entry->irq, &entry->msg);
-		msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+		msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
 	}
 
 	BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +616,8 @@ void pci_msi_shutdown(struct pci_dev* dev)
 	/* Return the the pci reset with msi irqs unmasked */
 	if (entry->msi_attrib.maskbit) {
 		u32 mask = entry->msi_attrib.maskbits_mask;
-		msi_set_mask_bits(dev->irq, mask, ~mask);
+		struct irq_desc *desc = irq_to_desc(dev->irq);
+		msi_set_mask_bits(desc, mask, ~mask);
 	}
 	if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
 		return;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8f293922720..d2b8a1e8ca1 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -10,8 +10,11 @@ struct msi_msg {
 };
 
 /* Helper functions */
+struct irq_desc;
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
-- 
cgit v1.2.3-70-g09d2


From 8b96f0119818964e4944fd1c423bf6770027d3ac Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Dec 2008 03:40:00 +0100
Subject: tracing/function-graph-tracer: introduce __notrace_funcgraph to
 filter special functions

Impact: trace more functions

When the function graph tracer is configured, three more files are not
traced to prevent only four functions to be traced. And this impacts the
normal function tracer too.

arch/x86/kernel/process_64/32.c:

I had crashes when I let this file traced. After some debugging, I saw
that the "current" task point was changed inside__swtich_to(), ie:
"write_pda(pcurrent, next_p);" inside process_64.c Since the tracer store
the original return address of the function inside current, we had
crashes. Only __switch_to() has to be excluded from tracing.

kernel/module.c and kernel/extable.c:

Because of a function used internally by the function graph tracer:
__kernel_text_address()

To let the other functions inside these files to be traced, this patch
introduces the __notrace_funcgraph function prefix which is __notrace if
function graph tracer is configured and nothing if not.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile     |  6 ------
 arch/x86/kernel/process_32.c |  4 +++-
 arch/x86/kernel/process_64.c |  4 +++-
 include/linux/ftrace.h       | 11 +++++++++++
 kernel/Makefile              |  4 ----
 kernel/extable.c             |  5 +++--
 kernel/module.c              |  2 +-
 7 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a3049da6198..1cad9318d21 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -14,12 +14,6 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 endif
 
-ifdef CONFIG_FUNCTION_GRAPH_TRACER
-# Don't trace __switch_to() but let it for function tracer
-CFLAGS_REMOVE_process_32.o = -pg
-CFLAGS_REMOVE_process_64.o = -pg
-endif
-
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d4..24c2276aa45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  */
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b..fbb321d53d3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/ftrace.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
  * - could test fs/gs bitsliced
  *
  * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
  */
-struct task_struct *
+__notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b9b4d0a22d1..449fa8e9e34 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -369,6 +369,14 @@ struct ftrace_graph_ret {
 };
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/*
+ * Sometimes we don't want to trace a function with the function
+ * graph tracer but we want them to keep traced by the usual function
+ * tracer if the function graph tracer is not configured.
+ */
+#define __notrace_funcgraph		notrace
+
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
 /* Type of the callback handlers for tracing function graph*/
@@ -394,6 +402,9 @@ static inline int task_curr_ret_stack(struct task_struct *t)
 	return t->curr_ret_stack;
 }
 #else
+
+#define __notrace_funcgraph
+
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 703cf3b7389..19fad003b19 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,10 +21,6 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -pg
 endif
-ifdef CONFIG_FUNCTION_GRAPH_TRACER
-CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
-CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
-endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e1702..feb0317cf09 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
 */
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/sections.h>
 
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
-int core_kernel_text(unsigned long addr)
+__notrace_funcgraph int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
 	    addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
 	return 0;
 }
 
-int __kernel_text_address(unsigned long addr)
+__notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
diff --git a/kernel/module.c b/kernel/module.c
index 89bcf7c1327..dd2a54155b5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2704,7 +2704,7 @@ int is_module_address(unsigned long addr)
 
 
 /* Is this a valid kernel address? */
-struct module *__module_text_address(unsigned long addr)
+__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
 {
 	struct module *mod;
 
-- 
cgit v1.2.3-70-g09d2


From 380c4b1411ccd6885f92b2c8ceb08433a720f44e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Dec 2008 03:43:41 +0100
Subject: tracing/function-graph-tracer: append the tracing_graph_flag

Impact: Provide a way to pause the function graph tracer

As suggested by Steven Rostedt, the previous patch that prevented from
spinlock function tracing shouldn't use the raw_spinlock to fix it.
It's much better to follow lockdep with normal spinlock, so this patch
adds a new flag for each task to make the function graph tracer able
to be paused. We also can send an ftrace_printk whithout worrying of
the irrelevant traced spinlock during insertion.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c |  5 ++++-
 include/linux/ftrace.h   | 13 +++++++++++++
 include/linux/sched.h    |  2 ++
 kernel/trace/ftrace.c    |  2 ++
 kernel/trace/trace.c     | 18 +++++-------------
 5 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index f98c4076a17..1b43086b097 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -476,7 +476,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 				&return_to_handler;
 
 	/* Nmi's are currently unsupported */
-	if (atomic_read(&in_nmi))
+	if (unlikely(atomic_read(&in_nmi)))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
 		return;
 
 	/*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 449fa8e9e34..11cac81eed0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -401,6 +401,16 @@ static inline int task_curr_ret_stack(struct task_struct *t)
 {
 	return t->curr_ret_stack;
 }
+
+static inline void pause_graph_tracing(void)
+{
+	atomic_inc(&current->tracing_graph_pause);
+}
+
+static inline void unpause_graph_tracing(void)
+{
+	atomic_dec(&current->tracing_graph_pause);
+}
 #else
 
 #define __notrace_funcgraph
@@ -412,6 +422,9 @@ static inline int task_curr_ret_stack(struct task_struct *tsk)
 {
 	return -1;
 }
+
+static inline void pause_graph_tracing(void) { }
+static inline void unpause_graph_tracing(void) { }
 #endif
 
 #ifdef CONFIG_TRACING
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c152e0acc9..4b81fc5f773 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1379,6 +1379,8 @@ struct task_struct {
 	 * because of depth overrun.
 	 */
 	atomic_t trace_overrun;
+	/* Pause for the tracing */
+	atomic_t tracing_graph_pause;
 #endif
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2971fe48f55..a12f80efcea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1998,6 +1998,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 			/* Make sure IRQs see the -1 first: */
 			barrier();
 			t->ret_stack = ret_stack_list[start++];
+			atomic_set(&t->tracing_graph_pause, 0);
 			atomic_set(&t->trace_overrun, 0);
 		}
 	} while_each_thread(g, t);
@@ -2077,6 +2078,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		if (!t->ret_stack)
 			return;
 		t->curr_ret_stack = -1;
+		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
 	} else
 		t->ret_stack = NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 33549537f30..0b8659bd5ad 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3590,14 +3590,7 @@ static __init int tracer_init_debugfs(void)
 
 int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	/*
-	 * Raw Spinlock because a normal spinlock would be traced here
-	 * and append an irrelevant couple spin_lock_irqsave/
-	 * spin_unlock_irqrestore traced by ftrace around this
-	 * TRACE_PRINTK trace.
-	 */
-	static raw_spinlock_t trace_buf_lock =
-				(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(trace_buf_lock);
 	static char trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -3618,8 +3611,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	if (unlikely(atomic_read(&data->disabled)))
 		goto out;
 
-	local_irq_save(flags);
-	__raw_spin_lock(&trace_buf_lock);
+	pause_graph_tracing();
+	spin_lock_irqsave(&trace_buf_lock, irq_flags);
 	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	len = min(len, TRACE_BUF_SIZE-1);
@@ -3640,9 +3633,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
  out_unlock:
-	__raw_spin_unlock(&trace_buf_lock);
-	local_irq_restore(flags);
-
+	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+	unpause_graph_tracing();
  out:
 	preempt_enable_notrace();
 
-- 
cgit v1.2.3-70-g09d2


From 240d367b4e6c6e3c5075e034db14dba60a6f5fa7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 8 Dec 2008 14:06:17 -0800
Subject: sparseirq: fix Alpha build failure

Impact: build fix on Alpha

-tip testing found this build failure on the Alpha defconfig:

/home/mingo/tip/fs/proc/stat.c: In function 'show_stat':
/home/mingo/tip/fs/proc/stat.c:48: error: implicit declaration of function 'for_each_irq_desc'
/home/mingo/tip/fs/proc/stat.c:48: error: expected ';' before '{' token

can not use irq_desc() in stat.c on older architectures.

Signed-off-by: Yinghai Lu <yinghai@kernel.orgg>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/stat.c        | 20 +++++++++++++-------
 include/linux/irq.h   |  9 ---------
 include/linux/irqnr.h | 19 ++++++++++++++++---
 3 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index a13431ab7c6..3cb9492801c 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -45,9 +45,12 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-		for_each_irq_desc(j, desc) {
+		for_each_irq_nr(j) {
+#ifdef CONFIG_SPARSE_IRQ
+			desc = irq_to_desc(j);
 			if (!desc)
 				continue;
+#endif
 			sum += kstat_irqs_cpu(j, i);
 		}
 		sum += arch_irq_stat_cpu(i);
@@ -92,14 +95,17 @@ static int show_stat(struct seq_file *p, void *v)
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
 	/* sum again ? it could be updated? */
-	for (j = 0; j < NR_IRQS; j++) {
-		desc = irq_to_desc(j);
+	for_each_irq_nr(j) {
 		per_irq_sum = 0;
-
-		if (desc) {
-			for_each_possible_cpu(i)
-				per_irq_sum += kstat_irqs_cpu(j, i);
+#ifdef CONFIG_SPARSE_IRQ
+		desc = irq_to_desc(j);
+		if (!desc) {
+			seq_printf(p, " %u", per_irq_sum);
+			continue;
 		}
+#endif
+		for_each_possible_cpu(i)
+			per_irq_sum += kstat_irqs_cpu(j, i);
 
 		seq_printf(p, " %u", per_irq_sum);
 	}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 63b00439d4d..b5749db3e5a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -198,7 +198,6 @@ extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
 extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
 
 #ifndef CONFIG_SPARSE_IRQ
-
 extern struct irq_desc irq_desc[NR_IRQS];
 
 static inline struct irq_desc *irq_to_desc(unsigned int irq)
@@ -210,14 +209,6 @@ static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	return irq_to_desc(irq);
 }
 
-#ifdef CONFIG_GENERIC_HARDIRQS
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-# define for_each_irq_desc_reverse(irq, desc)                          \
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
-	    irq >= 0; irq--, desc--)
-#endif
-
 #else
 
 extern struct irq_desc *irq_to_desc(unsigned int irq);
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 7a299e989f8..13754f81358 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -8,9 +8,22 @@
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0; irq < nr_irqs; irq++)
 
-static inline early_sparse_irq_init(void)
-{
-}
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1; irq >= 0; irq--)
+#else
+#ifndef CONFIG_SPARSE_IRQ
+
+struct irq_desc;
+extern int nr_irqs;
+# define for_each_irq_desc(irq, desc)		\
+	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+	    irq >= 0; irq--, desc--)
 #endif
+#endif
+
+#define for_each_irq_nr(irq)                   \
+       for (irq = 0; irq < nr_irqs; irq++)
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 1e641743f055f075ed9a4edd75f1fb1e05669ddc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Tue, 9 Dec 2008 09:23:33 +0000
Subject: Audit: Log TIOCSTI

AUDIT_TTY records currently log all data read by processes marked for
TTY input auditing, even if the data was "pushed back" using the TIOCSTI
ioctl, not typed by the user.

This patch records all TIOCSTI calls to disambiguate the input.  It
generates one audit message per character pushed back; considering
TIOCSTI is used very rarely, this simple solution is probably good
enough.  (The only program I could find that uses TIOCSTI is mailx/nail
in "header editing" mode, e.g. using the ~h escape.  mailx is used very
rarely, and the escapes are used even rarer.)

Signed-Off-By: Miloslav Trmac <mitr@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: James Morris <jmorris@namei.org>
---
 drivers/char/tty_audit.c | 78 +++++++++++++++++++++++++++++++++++++-----------
 drivers/char/tty_io.c    |  1 +
 include/linux/tty.h      |  4 +++
 3 files changed, 66 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index d961fa9612c..34ab6d798f8 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -67,37 +67,45 @@ static void tty_audit_buf_put(struct tty_audit_buf *buf)
 		tty_audit_buf_free(buf);
 }
 
-/**
- *	tty_audit_buf_push	-	Push buffered data out
- *
- *	Generate an audit message from the contents of @buf, which is owned by
- *	@tsk with @loginuid.  @buf->mutex must be locked.
- */
-static void tty_audit_buf_push(struct task_struct *tsk, uid_t loginuid,
-			       unsigned int sessionid,
-			       struct tty_audit_buf *buf)
+static void tty_audit_log(const char *description, struct task_struct *tsk,
+			  uid_t loginuid, unsigned sessionid, int major,
+			  int minor, unsigned char *data, size_t size)
 {
 	struct audit_buffer *ab;
 
-	if (buf->valid == 0)
-		return;
-	if (audit_enabled == 0)
-		return;
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_TTY);
 	if (ab) {
 		char name[sizeof(tsk->comm)];
 		uid_t uid = task_uid(tsk);
 
-		audit_log_format(ab, "tty pid=%u uid=%u auid=%u ses=%u "
-				 "major=%d minor=%d comm=",
+		audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u "
+				 "major=%d minor=%d comm=", description,
 				 tsk->pid, uid, loginuid, sessionid,
-				 buf->major, buf->minor);
+				 major, minor);
 		get_task_comm(name, tsk);
 		audit_log_untrustedstring(ab, name);
 		audit_log_format(ab, " data=");
-		audit_log_n_hex(ab, buf->data, buf->valid);
+		audit_log_n_hex(ab, data, size);
 		audit_log_end(ab);
 	}
+}
+
+/**
+ *	tty_audit_buf_push	-	Push buffered data out
+ *
+ *	Generate an audit message from the contents of @buf, which is owned by
+ *	@tsk with @loginuid.  @buf->mutex must be locked.
+ */
+static void tty_audit_buf_push(struct task_struct *tsk, uid_t loginuid,
+			       unsigned int sessionid,
+			       struct tty_audit_buf *buf)
+{
+	if (buf->valid == 0)
+		return;
+	if (audit_enabled == 0)
+		return;
+	tty_audit_log("tty", tsk, loginuid, sessionid, buf->major, buf->minor,
+		      buf->data, buf->valid);
 	buf->valid = 0;
 }
 
@@ -151,6 +159,42 @@ void tty_audit_fork(struct signal_struct *sig)
 	sig->tty_audit_buf = NULL;
 }
 
+/**
+ *	tty_audit_tiocsti	-	Log TIOCSTI
+ */
+void tty_audit_tiocsti(struct tty_struct *tty, char ch)
+{
+	struct tty_audit_buf *buf;
+	int major, minor, should_audit;
+
+	spin_lock_irq(&current->sighand->siglock);
+	should_audit = current->signal->audit_tty;
+	buf = current->signal->tty_audit_buf;
+	if (buf)
+		atomic_inc(&buf->count);
+	spin_unlock_irq(&current->sighand->siglock);
+
+	major = tty->driver->major;
+	minor = tty->driver->minor_start + tty->index;
+	if (buf) {
+		mutex_lock(&buf->mutex);
+		if (buf->major == major && buf->minor == minor)
+			tty_audit_buf_push_current(buf);
+		mutex_unlock(&buf->mutex);
+		tty_audit_buf_put(buf);
+	}
+
+	if (should_audit && audit_enabled) {
+		uid_t auid;
+		unsigned int sessionid;
+
+		auid = audit_get_loginuid(current);
+		sessionid = audit_get_sessionid(current);
+		tty_audit_log("ioctl=TIOCSTI", current, auid, sessionid, major,
+			      minor, &ch, 1);
+	}
+}
+
 /**
  *	tty_audit_push_task	-	Flush task's pending audit data
  */
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 1412a8d1e58..db15f9ba7c0 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2018,6 +2018,7 @@ static int tiocsti(struct tty_struct *tty, char __user *p)
 		return -EPERM;
 	if (get_user(ch, p))
 		return -EFAULT;
+	tty_audit_tiocsti(tty, ch);
 	ld = tty_ldisc_ref_wait(tty);
 	ld->ops->receive_buf(tty, &ch, &mbz, 1);
 	tty_ldisc_deref(ld);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 3b8121d4e36..580700f20a1 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -442,6 +442,7 @@ extern void tty_audit_add_data(struct tty_struct *tty, unsigned char *data,
 			       size_t size);
 extern void tty_audit_exit(void);
 extern void tty_audit_fork(struct signal_struct *sig);
+extern void tty_audit_tiocsti(struct tty_struct *tty, char ch);
 extern void tty_audit_push(struct tty_struct *tty);
 extern void tty_audit_push_task(struct task_struct *tsk,
 					uid_t loginuid, u32 sessionid);
@@ -450,6 +451,9 @@ static inline void tty_audit_add_data(struct tty_struct *tty,
 				      unsigned char *data, size_t size)
 {
 }
+static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch)
+{
+}
 static inline void tty_audit_exit(void)
 {
 }
-- 
cgit v1.2.3-70-g09d2


From 7b363e440021a1cf9ed76944b2685f48dacefb3e Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 9 Dec 2008 23:22:26 -0800
Subject: netpoll: fix race on poll_list resulting in garbage entry

	A few months back a race was discused between the netpoll napi service
path, and the fast path through net_rx_action:
http://kerneltrap.org/mailarchive/linux-netdev/2007/10/16/345470

A patch was submitted for that bug, but I think we missed a case.

Consider the following scenario:

INITIAL STATE
CPU0 has one napi_struct A on its poll_list
CPU1 is calling netpoll_send_skb and needs to call poll_napi on the same
napi_struct A that CPU0 has on its list


CPU0						CPU1
net_rx_action					poll_napi
!list_empty (returns true)			locks poll_lock for A
						 poll_one_napi
						  napi->poll
						   netif_rx_complete
						    __napi_complete
						    (removes A from poll_list)
list_entry(list->next)


In the above scenario, net_rx_action assumes that the per-cpu poll_list is
exclusive to that cpu.  netpoll of course violates that, and because the netpoll
path can dequeue from the poll list, its possible for CPU0 to detect a non-empty
list at the top of the while loop in net_rx_action, but have it become empty by
the time it calls list_entry.  Since the poll_list isn't surrounded by any other
structure, the returned data from that list_entry call in this situation is
garbage, and any number of crashes can result based on what exactly that garbage
is.

Given that its not fasible for performance reasons to place exclusive locks
arround each cpus poll list to provide that mutal exclusion, I think the best
solution is modify the netpoll path in such a way that we continue to guarantee
that the poll_list for a cpu is in fact exclusive to that cpu.  To do this I've
implemented the patch below.  It adds an additional bit to the state field in
the napi_struct.  When executing napi->poll from the netpoll_path, this bit will
be set. When a driver calls netif_rx_complete, if that bit is set, it will not
remove the napi_struct from the poll_list.  That work will be saved for the next
iteration of net_rx_action.

I've tested this and it seems to work well.  About the biggest drawback I can
see to it is the fact that it might result in an extra loop through
net_rx_action in the event that the device is actually contended for (i.e. the
netpoll path actually preforms all the needed work no the device, and the call
to net_rx_action winds up doing nothing, except removing the napi_struct from
the poll_list.  However I think this is probably a small price to pay, given
that the alternative is a crash.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 7 +++++++
 net/core/netpoll.c        | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9d77b1d7dca..e26f5495289 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -319,6 +319,7 @@ enum
 {
 	NAPI_STATE_SCHED,	/* Poll is scheduled */
 	NAPI_STATE_DISABLE,	/* Disable pending */
+	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 };
 
 extern void __napi_schedule(struct napi_struct *n);
@@ -1497,6 +1498,12 @@ static inline void netif_rx_complete(struct net_device *dev,
 {
 	unsigned long flags;
 
+	/*
+	 * don't let napi dequeue from the cpu poll list
+	 * just in case its running on a different cpu
+	 */
+	if (unlikely(test_bit(NAPI_STATE_NPSVC, &napi->state)))
+		return;
 	local_irq_save(flags);
 	__netif_rx_complete(dev, napi);
 	local_irq_restore(flags);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 6c7af390be0..dadac6281f2 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -133,9 +133,11 @@ static int poll_one_napi(struct netpoll_info *npinfo,
 
 	npinfo->rx_flags |= NETPOLL_RX_DROP;
 	atomic_inc(&trapped);
+	set_bit(NAPI_STATE_NPSVC, &napi->state);
 
 	work = napi->poll(napi, budget);
 
+	clear_bit(NAPI_STATE_NPSVC, &napi->state);
 	atomic_dec(&trapped);
 	npinfo->rx_flags &= ~NETPOLL_RX_DROP;
 
-- 
cgit v1.2.3-70-g09d2


From 58494487581cb143a0d763e3056a894d5009d60a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 26 Nov 2008 12:02:53 +0100
Subject: oprofile: update comment for oprofile_add_sample()

The cpu argument is no longer part of the parameter list.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 include/linux/oprofile.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h
index 5231861f357..1ce9fe572e5 100644
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -86,8 +86,7 @@ int oprofile_arch_init(struct oprofile_operations * ops);
 void oprofile_arch_exit(void);
 
 /**
- * Add a sample. This may be called from any context. Pass
- * smp_processor_id() as cpu.
+ * Add a sample. This may be called from any context.
  */
 void oprofile_add_sample(struct pt_regs * const regs, unsigned long event);
 
-- 
cgit v1.2.3-70-g09d2


From e09373f22e76cc048ca5fe10a9ff9012f5d64309 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 26 Nov 2008 14:04:19 +0100
Subject: ring_buffer: add remaining cpu functions to ring_buffer.h

These functions are not yet in ring_buffer.h though they seems to be
part of the API.

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 include/linux/ring_buffer.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index e097c2e6b6d..de9d8c12e5e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -116,6 +116,8 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
 
 unsigned long ring_buffer_entries(struct ring_buffer *buffer);
 unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 
 u64 ring_buffer_time_stamp(int cpu);
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
-- 
cgit v1.2.3-70-g09d2


From cdc693643271b2e6a693cf8f6afb258cce01f058 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 10 Dec 2008 13:55:49 +0000
Subject: ALSA: Add support for mechanical jack insertion

Some systems support both mechanical and electrical jack detection,
allowing them to report that a jack is physically present but does
not have any functioning connections. Add a new jack type for these,
allowing user space to report faulty connections.

Thanks to Guillem Jover for the suggestion.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/input.h | 1 +
 include/sound/jack.h  | 1 +
 sound/core/jack.c     | 6 ++++++
 3 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index 7323d2ff515..abd223b0f58 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -645,6 +645,7 @@ struct input_absinfo {
 #define SW_MICROPHONE_INSERT	0x04  /* set = inserted */
 #define SW_DOCK			0x05  /* set = plugged into dock */
 #define SW_LINEOUT_INSERT	0x06  /* set = inserted */
+#define SW_JACK_PHYSICAL_INSERT 0x07  /* set = mechanical switch set */
 #define SW_MAX			0x0f
 #define SW_CNT			(SW_MAX+1)
 
diff --git a/include/sound/jack.h b/include/sound/jack.h
index 7cb25f4b50b..2e0315cdd0d 100644
--- a/include/sound/jack.h
+++ b/include/sound/jack.h
@@ -36,6 +36,7 @@ enum snd_jack_types {
 	SND_JACK_MICROPHONE	= 0x0002,
 	SND_JACK_HEADSET	= SND_JACK_HEADPHONE | SND_JACK_MICROPHONE,
 	SND_JACK_LINEOUT	= 0x0004,
+	SND_JACK_MECHANICAL	= 0x0008, /* If detected separately */
 };
 
 struct snd_jack {
diff --git a/sound/core/jack.c b/sound/core/jack.c
index c4bb9bad313..6ebd5f12bc5 100644
--- a/sound/core/jack.c
+++ b/sound/core/jack.c
@@ -108,6 +108,9 @@ int snd_jack_new(struct snd_card *card, const char *id, int type,
 	if (type & SND_JACK_MICROPHONE)
 		input_set_capability(jack->input_dev, EV_SW,
 				     SW_MICROPHONE_INSERT);
+	if (type & SND_JACK_MECHANICAL)
+		input_set_capability(jack->input_dev, EV_SW,
+				     SW_JACK_PHYSICAL_INSERT);
 
 	err = snd_device_new(card, SNDRV_DEV_JACK, jack, &ops);
 	if (err < 0)
@@ -159,6 +162,9 @@ void snd_jack_report(struct snd_jack *jack, int status)
 	if (jack->type & SND_JACK_MICROPHONE)
 		input_report_switch(jack->input_dev, SW_MICROPHONE_INSERT,
 				    status & SND_JACK_MICROPHONE);
+	if (jack->type & SND_JACK_MECHANICAL)
+		input_report_switch(jack->input_dev, SW_JACK_PHYSICAL_INSERT,
+				    status & SND_JACK_MECHANICAL);
 
 	input_sync(jack->input_dev);
 }
-- 
cgit v1.2.3-70-g09d2


From 71c5576fbd809f2015f4eddf72e501e298720cf3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 9 Dec 2008 13:14:13 -0800
Subject: revert "percpu counter: clean up percpu_counter_sum_and_set()"

Revert

    commit 1f7c14c62ce63805f9574664a6c6de3633d4a354
    Author: Mingming Cao <cmm@us.ibm.com>
    Date:   Thu Oct 9 12:50:59 2008 -0400

        percpu counter: clean up percpu_counter_sum_and_set()

Before this patch we had the following:

percpu_counter_sum(): return the percpu_counter's value

percpu_counter_sum_and_set(): return the percpu_counter's value, copying
that value into the central value and zeroing the per-cpu counters before
returning.

After this patch, percpu_counter_sum_and_set() has gone, and
percpu_counter_sum() gets the old percpu_counter_sum_and_set()
functionality.

Problem is, as Eric points out, the old percpu_counter_sum_and_set()
functionality was racy and wrong.  It zeroes out counters on "other" cpus,
without holding any locks which will prevent races agaist updates from
those other CPUS.

This patch reverts 1f7c14c62ce63805f9574664a6c6de3633d4a354.  This means
that percpu_counter_sum_and_set() still has the race, but
percpu_counter_sum() does not.

Note that this is not a simple revert - ext4 has since started using
percpu_counter_sum() for its dirty_blocks counter as well.

Note that this revert patch changes percpu_counter_sum() semantics.

Before the patch, a call to percpu_counter_sum() will bring the counter's
central counter mostly up-to-date, so a following percpu_counter_read()
will return a close value.

After this patch, a call to percpu_counter_sum() will leave the counter's
central accumulator unaltered, so a subsequent call to
percpu_counter_read() can now return a significantly inaccurate result.

If there is any code in the tree which was introduced after
e8ced39d5e8911c662d4d69a342b9d053eaaac4e was merged, and which depends
upon the new percpu_counter_sum() semantics, that code will break.

Reported-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mingming Cao <cmm@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/balloc.c               |  4 ++--
 include/linux/percpu_counter.h | 12 +++++++++---
 lib/percpu_counter.c           |  8 +++++---
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2003cdc36a..c17f69bcd7d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,8 +609,8 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum(fbc);
-		dirty_blocks = percpu_counter_sum(dbc);
+		free_blocks  = percpu_counter_sum_and_set(fbc);
+		dirty_blocks = percpu_counter_sum_and_set(dbc);
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc11..20838883535 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc);
+	s64 ret = __percpu_counter_sum(fbc, 0);
 	return ret < 0 ? 0 : ret;
 }
 
+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc, 1);
+}
+
+
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc);
+	return __percpu_counter_sum(fbc, 0);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 71b265c330c..dba1530a5b2 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 {
 	s64 ret;
 	int cpu;
@@ -62,9 +62,11 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		*pcount = 0;
+		if (set)
+			*pcount = 0;
 	}
-	fbc->count = ret;
+	if (set)
+		fbc->count = ret;
 
 	spin_unlock(&fbc->lock);
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 02d211688727ad02bb4555b1aa8ae2de16b21b39 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 9 Dec 2008 13:14:14 -0800
Subject: revert "percpu_counter: new function percpu_counter_sum_and_set"

Revert

    commit e8ced39d5e8911c662d4d69a342b9d053eaaac4e
    Author: Mingming Cao <cmm@us.ibm.com>
    Date:   Fri Jul 11 19:27:31 2008 -0400

        percpu_counter: new function percpu_counter_sum_and_set

As described in

	revert "percpu counter: clean up percpu_counter_sum_and_set()"

the new percpu_counter_sum_and_set() is racy against updates to the
cpu-local accumulators on other CPUs.  Revert that change.

This means that ext4 will be slow again.  But correct.

Reported-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mingming Cao <cmm@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: <stable@kernel.org>		[2.6.27.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/balloc.c               |  4 ++--
 include/linux/percpu_counter.h | 12 +++---------
 lib/percpu_counter.c           |  7 +------
 3 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c17f69bcd7d..db35cfdb3c8 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,8 +609,8 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum_and_set(fbc);
-		dirty_blocks = percpu_counter_sum_and_set(dbc);
+		free_blocks  = percpu_counter_sum_positive(fbc);
+		dirty_blocks = percpu_counter_sum_positive(dbc);
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 20838883535..9007ccdfc11 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
+s64 __percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,19 +44,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc, 0);
+	s64 ret = __percpu_counter_sum(fbc);
 	return ret < 0 ? 0 : ret;
 }
 
-static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
-{
-	return __percpu_counter_sum(fbc, 1);
-}
-
-
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc, 0);
+	return __percpu_counter_sum(fbc);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index dba1530a5b2..b255b939bc1 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -62,12 +62,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		if (set)
-			*pcount = 0;
 	}
-	if (set)
-		fbc->count = ret;
-
 	spin_unlock(&fbc->lock);
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From 9c24624727f6d6c460e45762a408ca5f5b9b8ef2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 9 Dec 2008 13:14:27 -0800
Subject: KSYM_SYMBOL_LEN fixes

Miles Lane tailing /sys files hit a BUG which Pekka Enberg has tracked
to my 966c8c12dc9e77f931e2281ba25d2f0244b06949 sprint_symbol(): use
less stack exposing a bug in slub's list_locations() -
kallsyms_lookup() writes a 0 to namebuf[KSYM_NAME_LEN-1], but that was
beyond the end of page provided.

The 100 slop which list_locations() allows at end of page looks roughly
enough for all the other stuff it might print after the symbol before
it checks again: break out KSYM_SYMBOL_LEN earlier than before.

Latencytop and ftrace and are using KSYM_NAME_LEN buffers where they
need KSYM_SYMBOL_LEN buffers, and vmallocinfo a 2*KSYM_NAME_LEN buffer
where it wants a KSYM_SYMBOL_LEN buffer: fix those before anyone copies
them.

[akpm@linux-foundation.org: ftrace.h needs module.h]
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc Miles Lane <miles.lane@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c         | 2 +-
 include/linux/ftrace.h | 3 ++-
 kernel/latencytop.c    | 2 +-
 mm/slub.c              | 2 +-
 mm/vmalloc.c           | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3fe713..d4677603c88 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -371,7 +371,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 				task->latency_record[i].time,
 				task->latency_record[i].max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_NAME_LEN];
+				char sym[KSYM_SYMBOL_LEN];
 				char *c;
 				if (!task->latency_record[i].backtrace[q])
 					break;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 703eb53cfa2..9c5bc6be2b0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -6,6 +6,7 @@
 #include <linux/ktime.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kallsyms.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -231,7 +232,7 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 
 struct boot_trace {
 	pid_t			caller;
-	char			func[KSYM_NAME_LEN];
+	char			func[KSYM_SYMBOL_LEN];
 	int			result;
 	unsigned long long	duration;		/* usecs */
 	ktime_t			calltime;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c5692..449db466bdb 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
 				latency_record[i].time,
 				latency_record[i].max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_NAME_LEN];
+				char sym[KSYM_SYMBOL_LEN];
 				char *c;
 				if (!latency_record[i].backtrace[q])
 					break;
diff --git a/mm/slub.c b/mm/slub.c
index 749588a50a5..a2cd47d89e0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3597,7 +3597,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
 	for (i = 0; i < t.count; i++) {
 		struct location *l = &t.loc[i];
 
-		if (len > PAGE_SIZE - 100)
+		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
 			break;
 		len += sprintf(buf + len, "%7ld ", l->count);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f3f6e075856..1ddb77ba399 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1717,7 +1717,7 @@ static int s_show(struct seq_file *m, void *p)
 		v->addr, v->addr + v->size, v->size);
 
 	if (v->caller) {
-		char buff[2 * KSYM_NAME_LEN];
+		char buff[KSYM_SYMBOL_LEN];
 
 		seq_putc(m, ' ');
 		sprint_symbol(buff, (unsigned long)v->caller);
-- 
cgit v1.2.3-70-g09d2


From 2107fb8b5bf018be691afdd4c6ffaecf0c3307be Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Wed, 5 Nov 2008 00:35:38 +0000
Subject: smsc911x: add dynamic bus configuration

Convert the driver to select 16-bit or 32-bit bus access at runtime,
at a small performance cost.

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/smsc911x.c   | 158 +++++++++++++++++++++++------------------------
 drivers/net/smsc911x.h   |   1 -
 include/linux/smsc911x.h |   5 ++
 3 files changed, 84 insertions(+), 80 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index fe517880fc9..4b8ff843e30 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -77,19 +77,16 @@ struct smsc911x_data {
 	unsigned int generation;
 
 	/* device configuration (copied from platform_data during probe) */
-	unsigned int irq_polarity;
-	unsigned int irq_type;
-	phy_interface_t phy_interface;
+	struct smsc911x_platform_config config;
 
 	/* This needs to be acquired before calling any of below:
 	 * smsc911x_mac_read(), smsc911x_mac_write()
 	 */
 	spinlock_t mac_lock;
 
-#if (!SMSC_CAN_USE_32BIT)
-	/* spinlock to ensure 16-bit accesses are serialised */
+	/* spinlock to ensure 16-bit accesses are serialised.
+	 * unused with a 32-bit bus */
 	spinlock_t dev_lock;
-#endif
 
 	struct phy_device *phy_dev;
 	struct mii_bus *mii_bus;
@@ -121,70 +118,56 @@ struct smsc911x_data {
 	unsigned int hashlo;
 };
 
-#if SMSC_CAN_USE_32BIT
-
-static inline u32 smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
-{
-	return readl(pdata->ioaddr + reg);
-}
-
-static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
-				      u32 val)
-{
-	writel(val, pdata->ioaddr + reg);
-}
-
-/* Writes a packet to the TX_DATA_FIFO */
-static inline void
-smsc911x_tx_writefifo(struct smsc911x_data *pdata, unsigned int *buf,
-		      unsigned int wordcount)
-{
-	writesl(pdata->ioaddr + TX_DATA_FIFO, buf, wordcount);
-}
-
-/* Reads a packet out of the RX_DATA_FIFO */
-static inline void
-smsc911x_rx_readfifo(struct smsc911x_data *pdata, unsigned int *buf,
-		     unsigned int wordcount)
-{
-	readsl(pdata->ioaddr + RX_DATA_FIFO, buf, wordcount);
-}
-
-#else				/* SMSC_CAN_USE_32BIT */
-
-/* These 16-bit access functions are significantly slower, due to the locking
+/* The 16-bit access functions are significantly slower, due to the locking
  * necessary.  If your bus hardware can be configured to do this for you
  * (in response to a single 32-bit operation from software), you should use
  * the 32-bit access functions instead. */
 
 static inline u32 smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
 {
-	unsigned long flags;
-	u32 data;
-
-	/* these two 16-bit reads must be performed consecutively, so must
-	 * not be interrupted by our own ISR (which would start another
-	 * read operation) */
-	spin_lock_irqsave(&pdata->dev_lock, flags);
-	data = ((readw(pdata->ioaddr + reg) & 0xFFFF) |
-		((readw(pdata->ioaddr + reg + 2) & 0xFFFF) << 16));
-	spin_unlock_irqrestore(&pdata->dev_lock, flags);
+	if (pdata->config.flags & SMSC911X_USE_32BIT)
+		return readl(pdata->ioaddr + reg);
+
+	if (pdata->config.flags & SMSC911X_USE_16BIT) {
+		u32 data;
+		unsigned long flags;
+
+		/* these two 16-bit reads must be performed consecutively, so
+		 * must not be interrupted by our own ISR (which would start
+		 * another read operation) */
+		spin_lock_irqsave(&pdata->dev_lock, flags);
+		data = ((readw(pdata->ioaddr + reg) & 0xFFFF) |
+			((readw(pdata->ioaddr + reg + 2) & 0xFFFF) << 16));
+		spin_unlock_irqrestore(&pdata->dev_lock, flags);
+
+		return data;
+	}
 
-	return data;
+	BUG();
 }
 
 static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
 				      u32 val)
 {
-	unsigned long flags;
+	if (pdata->config.flags & SMSC911X_USE_32BIT) {
+		writel(val, pdata->ioaddr + reg);
+		return;
+	}
+
+	if (pdata->config.flags & SMSC911X_USE_16BIT) {
+		unsigned long flags;
+
+		/* these two 16-bit writes must be performed consecutively, so
+		 * must not be interrupted by our own ISR (which would start
+		 * another read operation) */
+		spin_lock_irqsave(&pdata->dev_lock, flags);
+		writew(val & 0xFFFF, pdata->ioaddr + reg);
+		writew((val >> 16) & 0xFFFF, pdata->ioaddr + reg + 2);
+		spin_unlock_irqrestore(&pdata->dev_lock, flags);
+		return;
+	}
 
-	/* these two 16-bit writes must be performed consecutively, so must
-	 * not be interrupted by our own ISR (which would start another
-	 * read operation) */
-	spin_lock_irqsave(&pdata->dev_lock, flags);
-	writew(val & 0xFFFF, pdata->ioaddr + reg);
-	writew((val >> 16) & 0xFFFF, pdata->ioaddr + reg + 2);
-	spin_unlock_irqrestore(&pdata->dev_lock, flags);
+	BUG();
 }
 
 /* Writes a packet to the TX_DATA_FIFO */
@@ -192,8 +175,18 @@ static inline void
 smsc911x_tx_writefifo(struct smsc911x_data *pdata, unsigned int *buf,
 		      unsigned int wordcount)
 {
-	while (wordcount--)
-		smsc911x_reg_write(pdata, TX_DATA_FIFO, *buf++);
+	if (pdata->config.flags & SMSC911X_USE_32BIT) {
+		writesl(pdata->ioaddr + TX_DATA_FIFO, buf, wordcount);
+		return;
+	}
+
+	if (pdata->config.flags & SMSC911X_USE_16BIT) {
+		while (wordcount--)
+			smsc911x_reg_write(pdata, TX_DATA_FIFO, *buf++);
+		return;
+	}
+
+	BUG();
 }
 
 /* Reads a packet out of the RX_DATA_FIFO */
@@ -201,11 +194,19 @@ static inline void
 smsc911x_rx_readfifo(struct smsc911x_data *pdata, unsigned int *buf,
 		     unsigned int wordcount)
 {
-	while (wordcount--)
-		*buf++ = smsc911x_reg_read(pdata, RX_DATA_FIFO);
-}
+	if (pdata->config.flags & SMSC911X_USE_32BIT) {
+		readsl(pdata->ioaddr + RX_DATA_FIFO, buf, wordcount);
+		return;
+	}
 
-#endif				/* SMSC_CAN_USE_32BIT */
+	if (pdata->config.flags & SMSC911X_USE_16BIT) {
+		while (wordcount--)
+			*buf++ = smsc911x_reg_read(pdata, RX_DATA_FIFO);
+		return;
+	}
+
+	BUG();
+}
 
 /* waits for MAC not busy, with timeout.  Only called by smsc911x_mac_read
  * and smsc911x_mac_write, so assumes mac_lock is held */
@@ -790,7 +791,7 @@ static int smsc911x_mii_probe(struct net_device *dev)
 	}
 
 	phydev = phy_connect(dev, phydev->dev.bus_id,
-		&smsc911x_phy_adjust_link, 0, pdata->phy_interface);
+		&smsc911x_phy_adjust_link, 0, pdata->config.phy_interface);
 
 	if (IS_ERR(phydev)) {
 		pr_err("%s: Could not attach to PHY\n", dev->name);
@@ -1205,14 +1206,14 @@ static int smsc911x_open(struct net_device *dev)
 	/* Set interrupt deassertion to 100uS */
 	intcfg = ((10 << 24) | INT_CFG_IRQ_EN_);
 
-	if (pdata->irq_polarity) {
+	if (pdata->config.irq_polarity) {
 		SMSC_TRACE(IFUP, "irq polarity: active high");
 		intcfg |= INT_CFG_IRQ_POL_;
 	} else {
 		SMSC_TRACE(IFUP, "irq polarity: active low");
 	}
 
-	if (pdata->irq_type) {
+	if (pdata->config.irq_type) {
 		SMSC_TRACE(IFUP, "irq type: push-pull");
 		intcfg |= INT_CFG_IRQ_TYPE_;
 	} else {
@@ -1767,9 +1768,7 @@ static int __devinit smsc911x_init(struct net_device *dev)
 	SMSC_TRACE(PROBE, "IRQ: %d", dev->irq);
 	SMSC_TRACE(PROBE, "PHY will be autodetected.");
 
-#if (!SMSC_CAN_USE_32BIT)
 	spin_lock_init(&pdata->dev_lock);
-#endif
 
 	if (pdata->ioaddr == 0) {
 		SMSC_WARNING(PROBE, "pdata->ioaddr: 0x00000000");
@@ -1910,6 +1909,7 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
 {
 	struct net_device *dev;
 	struct smsc911x_data *pdata;
+	struct smsc911x_platform_config *config = pdev->dev.platform_data;
 	struct resource *res;
 	unsigned int intcfg = 0;
 	int res_size;
@@ -1918,6 +1918,13 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
 
 	pr_info("%s: Driver version %s.\n", SMSC_CHIPNAME, SMSC_DRV_VERSION);
 
+	/* platform data specifies irq & dynamic bus configuration */
+	if (!pdev->dev.platform_data) {
+		pr_warning("%s: platform_data not provided\n", SMSC_CHIPNAME);
+		retval = -ENODEV;
+		goto out_0;
+	}
+
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
 					   "smsc911x-memory");
 	if (!res)
@@ -1949,15 +1956,8 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
 	dev->irq = platform_get_irq(pdev, 0);
 	pdata->ioaddr = ioremap_nocache(res->start, res_size);
 
-	/* copy config parameters across if present, otherwise pdata
-	 * defaults to zeros */
-	if (pdev->dev.platform_data) {
-		struct smsc911x_platform_config *config =
-			pdev->dev.platform_data;
-		pdata->irq_polarity = config->irq_polarity;
-		pdata->irq_type  = config->irq_type;
-		pdata->phy_interface = config->phy_interface;
-	}
+	/* copy config parameters across to pdata */
+	memcpy(&pdata->config, config, sizeof(pdata->config));
 
 	pdata->dev = dev;
 	pdata->msg_enable = ((1 << debug) - 1);
@@ -1974,10 +1974,10 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
 		goto out_unmap_io_3;
 
 	/* configure irq polarity and type before connecting isr */
-	if (pdata->irq_polarity == SMSC911X_IRQ_POLARITY_ACTIVE_HIGH)
+	if (pdata->config.irq_polarity == SMSC911X_IRQ_POLARITY_ACTIVE_HIGH)
 		intcfg |= INT_CFG_IRQ_POL_;
 
-	if (pdata->irq_type == SMSC911X_IRQ_TYPE_PUSH_PULL)
+	if (pdata->config.irq_type == SMSC911X_IRQ_TYPE_PUSH_PULL)
 		intcfg |= INT_CFG_IRQ_TYPE_;
 
 	smsc911x_reg_write(pdata, INT_CFG, intcfg);
diff --git a/drivers/net/smsc911x.h b/drivers/net/smsc911x.h
index feb36de274c..f818cf0415f 100644
--- a/drivers/net/smsc911x.h
+++ b/drivers/net/smsc911x.h
@@ -21,7 +21,6 @@
 #ifndef __SMSC911X_H__
 #define __SMSC911X_H__
 
-#define SMSC_CAN_USE_32BIT	1
 #define TX_FIFO_LOW_THRESHOLD	((u32)1600)
 #define SMSC911X_EEPROM_SIZE	((u32)7)
 #define USE_DEBUG		0
diff --git a/include/linux/smsc911x.h b/include/linux/smsc911x.h
index 47c4ffd10db..1cbf0313add 100644
--- a/include/linux/smsc911x.h
+++ b/include/linux/smsc911x.h
@@ -28,6 +28,7 @@
 struct smsc911x_platform_config {
 	unsigned int irq_polarity;
 	unsigned int irq_type;
+	unsigned int flags;
 	phy_interface_t phy_interface;
 };
 
@@ -39,4 +40,8 @@ struct smsc911x_platform_config {
 #define SMSC911X_IRQ_TYPE_OPEN_DRAIN		0
 #define SMSC911X_IRQ_TYPE_PUSH_PULL		1
 
+/* Constants for flags */
+#define SMSC911X_USE_16BIT 			(BIT(0))
+#define SMSC911X_USE_32BIT 			(BIT(1))
+
 #endif /* __LINUX_SMSC911X_H__ */
-- 
cgit v1.2.3-70-g09d2


From bd91b8bf372911c1e4d66d6bb44fe409349a6791 Mon Sep 17 00:00:00 2001
From: Benjamin Thery <benjamin.thery@bull.net>
Date: Wed, 10 Dec 2008 16:07:08 -0800
Subject: netns: ip6mr: allocate mroute6_socket per-namespace.

Preliminary work to make IPv6 multicast forwarding netns-aware.

Make IPv6 multicast forwarding mroute6_socket per-namespace,
moves it into struct netns_ipv6.

At the moment, mroute6_socket is only referenced in init_net.

Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute6.h  |  8 ++++++--
 include/net/netns/ipv6.h |  3 +++
 net/ipv6/ip6_output.c    |  3 ++-
 net/ipv6/ip6mr.c         | 22 ++++++++++------------
 4 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 6f4c180179e..2cd9901ee5c 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -117,6 +117,7 @@ struct sioc_mif_req6
 
 #include <linux/pim.h>
 #include <linux/skbuff.h>	/* for struct sk_buff_head */
+#include <net/net_namespace.h>
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline int ip6_mroute_opt(int opt)
@@ -232,10 +233,13 @@ struct rtmsg;
 extern int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait);
 
 #ifdef CONFIG_IPV6_MROUTE
-extern struct sock *mroute6_socket;
+static inline struct sock *mroute6_socket(struct net *net)
+{
+	return net->ipv6.mroute6_sk;
+}
 extern int ip6mr_sk_done(struct sock *sk);
 #else
-#define mroute6_socket NULL
+static inline struct sock *mroute6_socket(struct net *net) { return NULL; }
 static inline int ip6mr_sk_done(struct sock *sk) { return 0; }
 #endif
 #endif
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 2932721180c..8a0a67d073b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -55,5 +55,8 @@ struct netns_ipv6 {
 	struct sock             *ndisc_sk;
 	struct sock             *tcp_sk;
 	struct sock             *igmp_sk;
+#ifdef CONFIG_IPV6_MROUTE
+	struct sock		*mroute6_sk;
+#endif
 };
 #endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7d92fd97cfb..4b15938bef4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -137,7 +137,8 @@ static int ip6_output2(struct sk_buff *skb)
 		struct inet6_dev *idev = ip6_dst_idev(skb->dst);
 
 		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
-		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
+		    ((mroute6_socket(dev_net(dev)) &&
+		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 					 &ipv6_hdr(skb)->saddr))) {
 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d1008e6891e..02163dbf84c 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -49,9 +49,6 @@
 #include <net/addrconf.h>
 #include <linux/netfilter_ipv6.h>
 
-struct sock *mroute6_socket;
-
-
 /* Big lock, protecting vif table, mrt cache and mroute socket state.
    Note that the changes are semaphored via rtnl_lock.
  */
@@ -820,7 +817,7 @@ static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
 	skb_pull(skb, sizeof(struct ipv6hdr));
 	}
 
-	if (mroute6_socket == NULL) {
+	if (init_net.ipv6.mroute6_sk == NULL) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -828,7 +825,8 @@ static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
 	/*
 	 *	Deliver to user space multicast routing algorithms
 	 */
-	if ((ret = sock_queue_rcv_skb(mroute6_socket, skb)) < 0) {
+	ret = sock_queue_rcv_skb(init_net.ipv6.mroute6_sk, skb);
+	if (ret < 0) {
 		if (net_ratelimit())
 			printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n");
 		kfree_skb(skb);
@@ -1145,8 +1143,8 @@ static int ip6mr_sk_init(struct sock *sk)
 
 	rtnl_lock();
 	write_lock_bh(&mrt_lock);
-	if (likely(mroute6_socket == NULL))
-		mroute6_socket = sk;
+	if (likely(init_net.ipv6.mroute6_sk == NULL))
+		init_net.ipv6.mroute6_sk = sk;
 	else
 		err = -EADDRINUSE;
 	write_unlock_bh(&mrt_lock);
@@ -1161,9 +1159,9 @@ int ip6mr_sk_done(struct sock *sk)
 	int err = 0;
 
 	rtnl_lock();
-	if (sk == mroute6_socket) {
+	if (sk == init_net.ipv6.mroute6_sk) {
 		write_lock_bh(&mrt_lock);
-		mroute6_socket = NULL;
+		init_net.ipv6.mroute6_sk = NULL;
 		write_unlock_bh(&mrt_lock);
 
 		mroute_clean_tables(sk);
@@ -1189,7 +1187,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 	mifi_t mifi;
 
 	if (optname != MRT6_INIT) {
-		if (sk != mroute6_socket && !capable(CAP_NET_ADMIN))
+		if (sk != init_net.ipv6.mroute6_sk && !capable(CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
@@ -1214,7 +1212,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 		if (vif.mif6c_mifi >= MAXMIFS)
 			return -ENFILE;
 		rtnl_lock();
-		ret = mif6_add(&vif, sk == mroute6_socket);
+		ret = mif6_add(&vif, sk == init_net.ipv6.mroute6_sk);
 		rtnl_unlock();
 		return ret;
 
@@ -1242,7 +1240,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 		if (optname == MRT6_DEL_MFC)
 			ret = ip6mr_mfc_delete(&mfc);
 		else
-			ret = ip6mr_mfc_add(&mfc, sk == mroute6_socket);
+			ret = ip6mr_mfc_add(&mfc, sk == init_net.ipv6.mroute6_sk);
 		rtnl_unlock();
 		return ret;
 
-- 
cgit v1.2.3-70-g09d2


From 58701ad41105638baa0b38ffe9ac5b10469c1fd3 Mon Sep 17 00:00:00 2001
From: Benjamin Thery <benjamin.thery@bull.net>
Date: Wed, 10 Dec 2008 16:22:34 -0800
Subject: netns: ip6mr: store netns in struct mfc6_cache

This patch stores into struct mfc6_cache the network namespace each
mfc6_cache belongs to. The new member is mfc6_net.

mfc6_net is assigned at cache allocation and doesn't change during
the rest of the cache entry life.

This will help to retrieve the current netns around the IPv6 multicast
forwarding code.

At the moment, all mfc6_cache are allocated in init_net.

Changelog:
==========
* Use write_pnet()/read_pnet() to set and get mfc6_net.

Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute6.h | 15 +++++++++++++++
 net/ipv6/ip6mr.c        | 26 +++++++++++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 2cd9901ee5c..15d85fe12bb 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -188,6 +188,9 @@ struct mif_device
 struct mfc6_cache
 {
 	struct mfc6_cache *next;		/* Next entry on cache line 	*/
+#ifdef CONFIG_NET_NS
+	struct net *mfc6_net;
+#endif
 	struct in6_addr mf6c_mcastgrp;			/* Group the entry belongs to 	*/
 	struct in6_addr mf6c_origin;			/* Source of packet 		*/
 	mifi_t mf6c_parent;			/* Source interface		*/
@@ -210,6 +213,18 @@ struct mfc6_cache
 	} mfc_un;
 };
 
+static inline
+struct net *mfc6_net(const struct mfc6_cache *mfc)
+{
+	return read_pnet(&mfc->mfc6_net);
+}
+
+static inline
+void mfc6_net_set(struct mfc6_cache *mfc, struct net *net)
+{
+	write_pnet(&mfc->mfc6_net, hold_net(net));
+}
+
 #define MFC_STATIC		1
 #define MFC_NOTIFY		2
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index bae3ef64966..fab5aba28a2 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -506,6 +506,12 @@ static int mif6_delete(int vifi)
 	return 0;
 }
 
+static inline void ip6mr_cache_free(struct mfc6_cache *c)
+{
+	release_net(mfc6_net(c));
+	kmem_cache_free(mrt_cachep, c);
+}
+
 /* Destroy an unresolved cache entry, killing queued skbs
    and reporting error to netlink readers.
  */
@@ -528,7 +534,7 @@ static void ip6mr_destroy_unres(struct mfc6_cache *c)
 			kfree_skb(skb);
 	}
 
-	kmem_cache_free(mrt_cachep, c);
+	ip6mr_cache_free(c);
 }
 
 
@@ -685,22 +691,24 @@ static struct mfc6_cache *ip6mr_cache_find(struct in6_addr *origin, struct in6_a
 /*
  *	Allocate a multicast cache entry
  */
-static struct mfc6_cache *ip6mr_cache_alloc(void)
+static struct mfc6_cache *ip6mr_cache_alloc(struct net *net)
 {
 	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
 	if (c == NULL)
 		return NULL;
 	c->mfc_un.res.minvif = MAXMIFS;
+	mfc6_net_set(c, net);
 	return c;
 }
 
-static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
+static struct mfc6_cache *ip6mr_cache_alloc_unres(struct net *net)
 {
 	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
 	if (c == NULL)
 		return NULL;
 	skb_queue_head_init(&c->mfc_un.unres.unresolved);
 	c->mfc_un.unres.expires = jiffies + 10 * HZ;
+	mfc6_net_set(c, net);
 	return c;
 }
 
@@ -856,7 +864,7 @@ ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
 		 */
 
 		if (atomic_read(&cache_resolve_queue_len) >= 10 ||
-		    (c = ip6mr_cache_alloc_unres()) == NULL) {
+		    (c = ip6mr_cache_alloc_unres(&init_net)) == NULL) {
 			spin_unlock_bh(&mfc_unres_lock);
 
 			kfree_skb(skb);
@@ -879,7 +887,7 @@ ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
 			 */
 			spin_unlock_bh(&mfc_unres_lock);
 
-			kmem_cache_free(mrt_cachep, c);
+			ip6mr_cache_free(c);
 			kfree_skb(skb);
 			return err;
 		}
@@ -924,7 +932,7 @@ static int ip6mr_mfc_delete(struct mf6cctl *mfc)
 			*cp = c->next;
 			write_unlock_bh(&mrt_lock);
 
-			kmem_cache_free(mrt_cachep, c);
+			ip6mr_cache_free(c);
 			return 0;
 		}
 	}
@@ -1073,7 +1081,7 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
 	if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
 		return -EINVAL;
 
-	c = ip6mr_cache_alloc();
+	c = ip6mr_cache_alloc(&init_net);
 	if (c == NULL)
 		return -ENOMEM;
 
@@ -1108,7 +1116,7 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
 
 	if (uc) {
 		ip6mr_cache_resolve(uc, c);
-		kmem_cache_free(mrt_cachep, uc);
+		ip6mr_cache_free(uc);
 	}
 	return 0;
 }
@@ -1145,7 +1153,7 @@ static void mroute_clean_tables(struct sock *sk)
 			*cp = c->next;
 			write_unlock_bh(&mrt_lock);
 
-			kmem_cache_free(mrt_cachep, c);
+			ip6mr_cache_free(c);
 		}
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 8229efdaef1e7913ae1712c0ba752f267e5fcd5e Mon Sep 17 00:00:00 2001
From: Benjamin Thery <benjamin.thery@bull.net>
Date: Wed, 10 Dec 2008 16:30:15 -0800
Subject: netns: ip6mr: enable namespace support in ipv6 multicast forwarding
 code

This last patch makes the appropriate changes to use and propagate the
network namespace where needed in IPv6 multicast forwarding code.

This consists mainly in replacing all the remaining init_net occurences
with current netns pointer retrieved from sockets, net devices or
mfc6_caches depending on the routines' contexts.

Some routines receive a new 'struct net' parameter to propagate the current
netns:
* ip6mr_get_route
* ip6mr_cache_report
* ip6mr_cache_find
* ip6mr_cache_unresolved
* mif6_add/mif6_delete
* ip6mr_mfc_add/ip6mr_mfc_delete
* ip6mr_reg_vif

All the IPv6 multicast forwarding variables moved to struct netns_ipv6 by
the previous patches are now referenced in the correct namespace.

Changelog:
==========
* Take into account the net associated to mfc6_cache when matching entries in
  mfc_unres_queue list.
* Call mroute_clean_tables() in ip6mr_net_exit() to free memory allocated
  per-namespace.
* Call dev_net_set() in ip6mr_reg_vif() to initialize dev->nd_net
  correctly.

Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute6.h |   3 +-
 net/ipv6/ip6mr.c        | 226 +++++++++++++++++++++++++++---------------------
 net/ipv6/route.c        |   2 +-
 3 files changed, 129 insertions(+), 102 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 15d85fe12bb..5375faca1f7 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -245,7 +245,8 @@ void mfc6_net_set(struct mfc6_cache *mfc, struct net *net)
 
 #ifdef __KERNEL__
 struct rtmsg;
-extern int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait);
+extern int ip6mr_get_route(struct net *net, struct sk_buff *skb,
+			   struct rtmsg *rtm, int nowait);
 
 #ifdef CONFIG_IPV6_MROUTE
 static inline struct sock *mroute6_socket(struct net *net)
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d619faf68d9..9eed2422b3e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -77,8 +77,10 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 static struct kmem_cache *mrt_cachep __read_mostly;
 
 static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache);
-static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert);
+static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt,
+			      mifi_t mifi, int assert);
 static int ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm);
+static void mroute_clean_tables(struct net *net);
 
 #ifdef CONFIG_IPV6_PIMSM_V2
 static struct inet6_protocol pim6_protocol;
@@ -354,7 +356,8 @@ static int pim6_rcv(struct sk_buff *skb)
 	struct pimreghdr *pim;
 	struct ipv6hdr   *encap;
 	struct net_device  *reg_dev = NULL;
-	int reg_vif_num = init_net.ipv6.mroute_reg_vif_num;
+	struct net *net = dev_net(skb->dev);
+	int reg_vif_num = net->ipv6.mroute_reg_vif_num;
 
 	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
 		goto drop;
@@ -377,7 +380,7 @@ static int pim6_rcv(struct sk_buff *skb)
 
 	read_lock(&mrt_lock);
 	if (reg_vif_num >= 0)
-		reg_dev = init_net.ipv6.vif6_table[reg_vif_num].dev;
+		reg_dev = net->ipv6.vif6_table[reg_vif_num].dev;
 	if (reg_dev)
 		dev_hold(reg_dev);
 	read_unlock(&mrt_lock);
@@ -413,10 +416,13 @@ static struct inet6_protocol pim6_protocol = {
 
 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	struct net *net = dev_net(dev);
+
 	read_lock(&mrt_lock);
 	dev->stats.tx_bytes += skb->len;
 	dev->stats.tx_packets++;
-	ip6mr_cache_report(skb, init_net.ipv6.mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
+	ip6mr_cache_report(net, skb, net->ipv6.mroute_reg_vif_num,
+			   MRT6MSG_WHOLEPKT);
 	read_unlock(&mrt_lock);
 	kfree_skb(skb);
 	return 0;
@@ -435,7 +441,7 @@ static void reg_vif_setup(struct net_device *dev)
 	dev->destructor		= free_netdev;
 }
 
-static struct net_device *ip6mr_reg_vif(void)
+static struct net_device *ip6mr_reg_vif(struct net *net)
 {
 	struct net_device *dev;
 
@@ -443,6 +449,8 @@ static struct net_device *ip6mr_reg_vif(void)
 	if (dev == NULL)
 		return NULL;
 
+	dev_net_set(dev, net);
+
 	if (register_netdevice(dev)) {
 		free_netdev(dev);
 		return NULL;
@@ -469,14 +477,14 @@ failure:
  *	Delete a VIF entry
  */
 
-static int mif6_delete(int vifi)
+static int mif6_delete(struct net *net, int vifi)
 {
 	struct mif_device *v;
 	struct net_device *dev;
-	if (vifi < 0 || vifi >= init_net.ipv6.maxvif)
+	if (vifi < 0 || vifi >= net->ipv6.maxvif)
 		return -EADDRNOTAVAIL;
 
-	v = &init_net.ipv6.vif6_table[vifi];
+	v = &net->ipv6.vif6_table[vifi];
 
 	write_lock_bh(&mrt_lock);
 	dev = v->dev;
@@ -488,17 +496,17 @@ static int mif6_delete(int vifi)
 	}
 
 #ifdef CONFIG_IPV6_PIMSM_V2
-	if (vifi == init_net.ipv6.mroute_reg_vif_num)
-		init_net.ipv6.mroute_reg_vif_num = -1;
+	if (vifi == net->ipv6.mroute_reg_vif_num)
+		net->ipv6.mroute_reg_vif_num = -1;
 #endif
 
-	if (vifi + 1 == init_net.ipv6.maxvif) {
+	if (vifi + 1 == net->ipv6.maxvif) {
 		int tmp;
 		for (tmp = vifi - 1; tmp >= 0; tmp--) {
-			if (MIF_EXISTS(&init_net, tmp))
+			if (MIF_EXISTS(net, tmp))
 				break;
 		}
-		init_net.ipv6.maxvif = tmp + 1;
+		net->ipv6.maxvif = tmp + 1;
 	}
 
 	write_unlock_bh(&mrt_lock);
@@ -525,8 +533,9 @@ static inline void ip6mr_cache_free(struct mfc6_cache *c)
 static void ip6mr_destroy_unres(struct mfc6_cache *c)
 {
 	struct sk_buff *skb;
+	struct net *net = mfc6_net(c);
 
-	atomic_dec(&init_net.ipv6.cache_resolve_queue_len);
+	atomic_dec(&net->ipv6.cache_resolve_queue_len);
 
 	while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) {
 		if (ipv6_hdr(skb)->version == 0) {
@@ -535,7 +544,7 @@ static void ip6mr_destroy_unres(struct mfc6_cache *c)
 			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
 			skb_trim(skb, nlh->nlmsg_len);
 			((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
-			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
+			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
 		} else
 			kfree_skb(skb);
 	}
@@ -590,13 +599,14 @@ static void ipmr_expire_process(unsigned long dummy)
 static void ip6mr_update_thresholds(struct mfc6_cache *cache, unsigned char *ttls)
 {
 	int vifi;
+	struct net *net = mfc6_net(cache);
 
 	cache->mfc_un.res.minvif = MAXMIFS;
 	cache->mfc_un.res.maxvif = 0;
 	memset(cache->mfc_un.res.ttls, 255, MAXMIFS);
 
-	for (vifi = 0; vifi < init_net.ipv6.maxvif; vifi++) {
-		if (MIF_EXISTS(&init_net, vifi) &&
+	for (vifi = 0; vifi < net->ipv6.maxvif; vifi++) {
+		if (MIF_EXISTS(net, vifi) &&
 		    ttls[vifi] && ttls[vifi] < 255) {
 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
 			if (cache->mfc_un.res.minvif > vifi)
@@ -607,15 +617,15 @@ static void ip6mr_update_thresholds(struct mfc6_cache *cache, unsigned char *ttl
 	}
 }
 
-static int mif6_add(struct mif6ctl *vifc, int mrtsock)
+static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock)
 {
 	int vifi = vifc->mif6c_mifi;
-	struct mif_device *v = &init_net.ipv6.vif6_table[vifi];
+	struct mif_device *v = &net->ipv6.vif6_table[vifi];
 	struct net_device *dev;
 	int err;
 
 	/* Is vif busy ? */
-	if (MIF_EXISTS(&init_net, vifi))
+	if (MIF_EXISTS(net, vifi))
 		return -EADDRINUSE;
 
 	switch (vifc->mif6c_flags) {
@@ -625,9 +635,9 @@ static int mif6_add(struct mif6ctl *vifc, int mrtsock)
 		 * Special Purpose VIF in PIM
 		 * All the packets will be sent to the daemon
 		 */
-		if (init_net.ipv6.mroute_reg_vif_num >= 0)
+		if (net->ipv6.mroute_reg_vif_num >= 0)
 			return -EADDRINUSE;
-		dev = ip6mr_reg_vif();
+		dev = ip6mr_reg_vif(net);
 		if (!dev)
 			return -ENOBUFS;
 		err = dev_set_allmulti(dev, 1);
@@ -639,7 +649,7 @@ static int mif6_add(struct mif6ctl *vifc, int mrtsock)
 		break;
 #endif
 	case 0:
-		dev = dev_get_by_index(&init_net, vifc->mif6c_pifi);
+		dev = dev_get_by_index(net, vifc->mif6c_pifi);
 		if (!dev)
 			return -EADDRNOTAVAIL;
 		err = dev_set_allmulti(dev, 1);
@@ -673,20 +683,22 @@ static int mif6_add(struct mif6ctl *vifc, int mrtsock)
 	v->dev = dev;
 #ifdef CONFIG_IPV6_PIMSM_V2
 	if (v->flags & MIFF_REGISTER)
-		init_net.ipv6.mroute_reg_vif_num = vifi;
+		net->ipv6.mroute_reg_vif_num = vifi;
 #endif
-	if (vifi + 1 > init_net.ipv6.maxvif)
-		init_net.ipv6.maxvif = vifi + 1;
+	if (vifi + 1 > net->ipv6.maxvif)
+		net->ipv6.maxvif = vifi + 1;
 	write_unlock_bh(&mrt_lock);
 	return 0;
 }
 
-static struct mfc6_cache *ip6mr_cache_find(struct in6_addr *origin, struct in6_addr *mcastgrp)
+static struct mfc6_cache *ip6mr_cache_find(struct net *net,
+					   struct in6_addr *origin,
+					   struct in6_addr *mcastgrp)
 {
 	int line = MFC6_HASH(mcastgrp, origin);
 	struct mfc6_cache *c;
 
-	for (c = init_net.ipv6.mfc6_cache_array[line]; c; c = c->next) {
+	for (c = net->ipv6.mfc6_cache_array[line]; c; c = c->next) {
 		if (ipv6_addr_equal(&c->mf6c_origin, origin) &&
 		    ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp))
 			break;
@@ -743,7 +755,7 @@ static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c)
 				skb_trim(skb, nlh->nlmsg_len);
 				((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE;
 			}
-			err = rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
+			err = rtnl_unicast(skb, mfc6_net(uc), NETLINK_CB(skb).pid);
 		} else
 			ip6_mr_forward(skb, c);
 	}
@@ -756,7 +768,8 @@ static void ip6mr_cache_resolve(struct mfc6_cache *uc, struct mfc6_cache *c)
  *	Called under mrt_lock.
  */
 
-static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
+static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi,
+			      int assert)
 {
 	struct sk_buff *skb;
 	struct mrt6msg *msg;
@@ -792,7 +805,7 @@ static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
 		msg = (struct mrt6msg *)skb_transport_header(skb);
 		msg->im6_mbz = 0;
 		msg->im6_msgtype = MRT6MSG_WHOLEPKT;
-		msg->im6_mif = init_net.ipv6.mroute_reg_vif_num;
+		msg->im6_mif = net->ipv6.mroute_reg_vif_num;
 		msg->im6_pad = 0;
 		ipv6_addr_copy(&msg->im6_src, &ipv6_hdr(pkt)->saddr);
 		ipv6_addr_copy(&msg->im6_dst, &ipv6_hdr(pkt)->daddr);
@@ -829,7 +842,7 @@ static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
 	skb_pull(skb, sizeof(struct ipv6hdr));
 	}
 
-	if (init_net.ipv6.mroute6_sk == NULL) {
+	if (net->ipv6.mroute6_sk == NULL) {
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -837,7 +850,7 @@ static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
 	/*
 	 *	Deliver to user space multicast routing algorithms
 	 */
-	ret = sock_queue_rcv_skb(init_net.ipv6.mroute6_sk, skb);
+	ret = sock_queue_rcv_skb(net->ipv6.mroute6_sk, skb);
 	if (ret < 0) {
 		if (net_ratelimit())
 			printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n");
@@ -852,14 +865,14 @@ static int ip6mr_cache_report(struct sk_buff *pkt, mifi_t mifi, int assert)
  */
 
 static int
-ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
+ip6mr_cache_unresolved(struct net *net, mifi_t mifi, struct sk_buff *skb)
 {
 	int err;
 	struct mfc6_cache *c;
 
 	spin_lock_bh(&mfc_unres_lock);
 	for (c = mfc_unres_queue; c; c = c->next) {
-		if (net_eq(mfc6_net(c), &init_net) &&
+		if (net_eq(mfc6_net(c), net) &&
 		    ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
 		    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr))
 			break;
@@ -870,8 +883,8 @@ ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
 		 *	Create a new entry if allowable
 		 */
 
-		if (atomic_read(&init_net.ipv6.cache_resolve_queue_len) >= 10 ||
-		    (c = ip6mr_cache_alloc_unres(&init_net)) == NULL) {
+		if (atomic_read(&net->ipv6.cache_resolve_queue_len) >= 10 ||
+		    (c = ip6mr_cache_alloc_unres(net)) == NULL) {
 			spin_unlock_bh(&mfc_unres_lock);
 
 			kfree_skb(skb);
@@ -888,7 +901,8 @@ ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
 		/*
 		 *	Reflect first query at pim6sd
 		 */
-		if ((err = ip6mr_cache_report(skb, mifi, MRT6MSG_NOCACHE)) < 0) {
+		err = ip6mr_cache_report(net, skb, mifi, MRT6MSG_NOCACHE);
+		if (err < 0) {
 			/* If the report failed throw the cache entry
 			   out - Brad Parker
 			 */
@@ -899,7 +913,7 @@ ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
 			return err;
 		}
 
-		atomic_inc(&init_net.ipv6.cache_resolve_queue_len);
+		atomic_inc(&net->ipv6.cache_resolve_queue_len);
 		c->next = mfc_unres_queue;
 		mfc_unres_queue = c;
 
@@ -925,14 +939,14 @@ ip6mr_cache_unresolved(mifi_t mifi, struct sk_buff *skb)
  *	MFC6 cache manipulation by user space
  */
 
-static int ip6mr_mfc_delete(struct mf6cctl *mfc)
+static int ip6mr_mfc_delete(struct net *net, struct mf6cctl *mfc)
 {
 	int line;
 	struct mfc6_cache *c, **cp;
 
 	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
 
-	for (cp = &init_net.ipv6.mfc6_cache_array[line];
+	for (cp = &net->ipv6.mfc6_cache_array[line];
 	     (c = *cp) != NULL; cp = &c->next) {
 		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
 		    ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) {
@@ -951,19 +965,17 @@ static int ip6mr_device_event(struct notifier_block *this,
 			      unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
+	struct net *net = dev_net(dev);
 	struct mif_device *v;
 	int ct;
 
-	if (!net_eq(dev_net(dev), &init_net))
-		return NOTIFY_DONE;
-
 	if (event != NETDEV_UNREGISTER)
 		return NOTIFY_DONE;
 
-	v = &init_net.ipv6.vif6_table[0];
-	for (ct = 0; ct < init_net.ipv6.maxvif; ct++, v++) {
+	v = &net->ipv6.vif6_table[0];
+	for (ct = 0; ct < net->ipv6.maxvif; ct++, v++) {
 		if (v->dev == dev)
-			mif6_delete(ct);
+			mif6_delete(net, ct);
 	}
 	return NOTIFY_DONE;
 }
@@ -1026,6 +1038,7 @@ static void __net_exit ip6mr_net_exit(struct net *net)
 	proc_net_remove(net, "ip6_mr_cache");
 	proc_net_remove(net, "ip6_mr_vif");
 #endif
+	mroute_clean_tables(net);
 	kfree(net->ipv6.mfc6_cache_array);
 	kfree(net->ipv6.vif6_table);
 }
@@ -1071,7 +1084,7 @@ void ip6_mr_cleanup(void)
 	kmem_cache_destroy(mrt_cachep);
 }
 
-static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
+static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock)
 {
 	int line;
 	struct mfc6_cache *uc, *c, **cp;
@@ -1087,7 +1100,7 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
 
 	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
 
-	for (cp = &init_net.ipv6.mfc6_cache_array[line];
+	for (cp = &net->ipv6.mfc6_cache_array[line];
 	     (c = *cp) != NULL; cp = &c->next) {
 		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
 		    ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr))
@@ -1107,7 +1120,7 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
 	if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
 		return -EINVAL;
 
-	c = ip6mr_cache_alloc(&init_net);
+	c = ip6mr_cache_alloc(net);
 	if (c == NULL)
 		return -ENOMEM;
 
@@ -1119,8 +1132,8 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
 	write_lock_bh(&mrt_lock);
-	c->next = init_net.ipv6.mfc6_cache_array[line];
-	init_net.ipv6.mfc6_cache_array[line] = c;
+	c->next = net->ipv6.mfc6_cache_array[line];
+	net->ipv6.mfc6_cache_array[line] = c;
 	write_unlock_bh(&mrt_lock);
 
 	/*
@@ -1130,11 +1143,11 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
 	spin_lock_bh(&mfc_unres_lock);
 	for (cp = &mfc_unres_queue; (uc = *cp) != NULL;
 	     cp = &uc->next) {
-		if (net_eq(mfc6_net(uc), &init_net) &&
+		if (net_eq(mfc6_net(uc), net) &&
 		    ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
 		    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
 			*cp = uc->next;
-			atomic_dec(&init_net.ipv6.cache_resolve_queue_len);
+			atomic_dec(&net->ipv6.cache_resolve_queue_len);
 			break;
 		}
 	}
@@ -1153,16 +1166,16 @@ static int ip6mr_mfc_add(struct mf6cctl *mfc, int mrtsock)
  *	Close the multicast socket, and clear the vif tables etc
  */
 
-static void mroute_clean_tables(struct sock *sk)
+static void mroute_clean_tables(struct net *net)
 {
 	int i;
 
 	/*
 	 *	Shut down all active vif entries
 	 */
-	for (i = 0; i < init_net.ipv6.maxvif; i++) {
-		if (!(init_net.ipv6.vif6_table[i].flags & VIFF_STATIC))
-			mif6_delete(i);
+	for (i = 0; i < net->ipv6.maxvif; i++) {
+		if (!(net->ipv6.vif6_table[i].flags & VIFF_STATIC))
+			mif6_delete(net, i);
 	}
 
 	/*
@@ -1171,7 +1184,7 @@ static void mroute_clean_tables(struct sock *sk)
 	for (i = 0; i < MFC6_LINES; i++) {
 		struct mfc6_cache *c, **cp;
 
-		cp = &init_net.ipv6.mfc6_cache_array[i];
+		cp = &net->ipv6.mfc6_cache_array[i];
 		while ((c = *cp) != NULL) {
 			if (c->mfc_flags & MFC_STATIC) {
 				cp = &c->next;
@@ -1185,13 +1198,13 @@ static void mroute_clean_tables(struct sock *sk)
 		}
 	}
 
-	if (atomic_read(&init_net.ipv6.cache_resolve_queue_len) != 0) {
+	if (atomic_read(&net->ipv6.cache_resolve_queue_len) != 0) {
 		struct mfc6_cache *c, **cp;
 
 		spin_lock_bh(&mfc_unres_lock);
 		cp = &mfc_unres_queue;
 		while ((c = *cp) != NULL) {
-			if (!net_eq(mfc6_net(c), &init_net)) {
+			if (!net_eq(mfc6_net(c), net)) {
 				cp = &c->next;
 				continue;
 			}
@@ -1205,11 +1218,12 @@ static void mroute_clean_tables(struct sock *sk)
 static int ip6mr_sk_init(struct sock *sk)
 {
 	int err = 0;
+	struct net *net = sock_net(sk);
 
 	rtnl_lock();
 	write_lock_bh(&mrt_lock);
-	if (likely(init_net.ipv6.mroute6_sk == NULL))
-		init_net.ipv6.mroute6_sk = sk;
+	if (likely(net->ipv6.mroute6_sk == NULL))
+		net->ipv6.mroute6_sk = sk;
 	else
 		err = -EADDRINUSE;
 	write_unlock_bh(&mrt_lock);
@@ -1222,14 +1236,15 @@ static int ip6mr_sk_init(struct sock *sk)
 int ip6mr_sk_done(struct sock *sk)
 {
 	int err = 0;
+	struct net *net = sock_net(sk);
 
 	rtnl_lock();
-	if (sk == init_net.ipv6.mroute6_sk) {
+	if (sk == net->ipv6.mroute6_sk) {
 		write_lock_bh(&mrt_lock);
-		init_net.ipv6.mroute6_sk = NULL;
+		net->ipv6.mroute6_sk = NULL;
 		write_unlock_bh(&mrt_lock);
 
-		mroute_clean_tables(sk);
+		mroute_clean_tables(net);
 	} else
 		err = -EACCES;
 	rtnl_unlock();
@@ -1250,9 +1265,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 	struct mif6ctl vif;
 	struct mf6cctl mfc;
 	mifi_t mifi;
+	struct net *net = sock_net(sk);
 
 	if (optname != MRT6_INIT) {
-		if (sk != init_net.ipv6.mroute6_sk && !capable(CAP_NET_ADMIN))
+		if (sk != net->ipv6.mroute6_sk && !capable(CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
@@ -1277,7 +1293,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 		if (vif.mif6c_mifi >= MAXMIFS)
 			return -ENFILE;
 		rtnl_lock();
-		ret = mif6_add(&vif, sk == init_net.ipv6.mroute6_sk);
+		ret = mif6_add(net, &vif, sk == net->ipv6.mroute6_sk);
 		rtnl_unlock();
 		return ret;
 
@@ -1287,7 +1303,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 		if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
 			return -EFAULT;
 		rtnl_lock();
-		ret = mif6_delete(mifi);
+		ret = mif6_delete(net, mifi);
 		rtnl_unlock();
 		return ret;
 
@@ -1303,9 +1319,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 			return -EFAULT;
 		rtnl_lock();
 		if (optname == MRT6_DEL_MFC)
-			ret = ip6mr_mfc_delete(&mfc);
+			ret = ip6mr_mfc_delete(net, &mfc);
 		else
-			ret = ip6mr_mfc_add(&mfc, sk == init_net.ipv6.mroute6_sk);
+			ret = ip6mr_mfc_add(net, &mfc,
+					    sk == net->ipv6.mroute6_sk);
 		rtnl_unlock();
 		return ret;
 
@@ -1317,7 +1334,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 		int v;
 		if (get_user(v, (int __user *)optval))
 			return -EFAULT;
-		init_net.ipv6.mroute_do_assert = !!v;
+		net->ipv6.mroute_do_assert = !!v;
 		return 0;
 	}
 
@@ -1330,10 +1347,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int
 		v = !!v;
 		rtnl_lock();
 		ret = 0;
-		if (v != init_net.ipv6.mroute_do_pim) {
-			init_net.ipv6.mroute_do_pim = v;
-			init_net.ipv6.mroute_do_assert = v;
-			if (init_net.ipv6.mroute_do_pim)
+		if (v != net->ipv6.mroute_do_pim) {
+			net->ipv6.mroute_do_pim = v;
+			net->ipv6.mroute_do_assert = v;
+			if (net->ipv6.mroute_do_pim)
 				ret = inet6_add_protocol(&pim6_protocol,
 							 IPPROTO_PIM);
 			else
@@ -1365,6 +1382,7 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
 {
 	int olr;
 	int val;
+	struct net *net = sock_net(sk);
 
 	switch (optname) {
 	case MRT6_VERSION:
@@ -1372,11 +1390,11 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
 		break;
 #ifdef CONFIG_IPV6_PIMSM_V2
 	case MRT6_PIM:
-		val = init_net.ipv6.mroute_do_pim;
+		val = net->ipv6.mroute_do_pim;
 		break;
 #endif
 	case MRT6_ASSERT:
-		val = init_net.ipv6.mroute_do_assert;
+		val = net->ipv6.mroute_do_assert;
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -1406,16 +1424,17 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 	struct sioc_mif_req6 vr;
 	struct mif_device *vif;
 	struct mfc6_cache *c;
+	struct net *net = sock_net(sk);
 
 	switch (cmd) {
 	case SIOCGETMIFCNT_IN6:
 		if (copy_from_user(&vr, arg, sizeof(vr)))
 			return -EFAULT;
-		if (vr.mifi >= init_net.ipv6.maxvif)
+		if (vr.mifi >= net->ipv6.maxvif)
 			return -EINVAL;
 		read_lock(&mrt_lock);
-		vif = &init_net.ipv6.vif6_table[vr.mifi];
-		if (MIF_EXISTS(&init_net, vr.mifi)) {
+		vif = &net->ipv6.vif6_table[vr.mifi];
+		if (MIF_EXISTS(net, vr.mifi)) {
 			vr.icount = vif->pkt_in;
 			vr.ocount = vif->pkt_out;
 			vr.ibytes = vif->bytes_in;
@@ -1433,7 +1452,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 			return -EFAULT;
 
 		read_lock(&mrt_lock);
-		c = ip6mr_cache_find(&sr.src.sin6_addr, &sr.grp.sin6_addr);
+		c = ip6mr_cache_find(net, &sr.src.sin6_addr, &sr.grp.sin6_addr);
 		if (c) {
 			sr.pktcnt = c->mfc_un.res.pkt;
 			sr.bytecnt = c->mfc_un.res.bytes;
@@ -1466,7 +1485,8 @@ static inline int ip6mr_forward2_finish(struct sk_buff *skb)
 static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi)
 {
 	struct ipv6hdr *ipv6h;
-	struct mif_device *vif = &init_net.ipv6.vif6_table[vifi];
+	struct net *net = mfc6_net(c);
+	struct mif_device *vif = &net->ipv6.vif6_table[vifi];
 	struct net_device *dev;
 	struct dst_entry *dst;
 	struct flowi fl;
@@ -1480,7 +1500,7 @@ static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi)
 		vif->bytes_out += skb->len;
 		vif->dev->stats.tx_bytes += skb->len;
 		vif->dev->stats.tx_packets++;
-		ip6mr_cache_report(skb, vifi, MRT6MSG_WHOLEPKT);
+		ip6mr_cache_report(net, skb, vifi, MRT6MSG_WHOLEPKT);
 		kfree_skb(skb);
 		return 0;
 	}
@@ -1495,7 +1515,7 @@ static int ip6mr_forward2(struct sk_buff *skb, struct mfc6_cache *c, int vifi)
 		}
 	};
 
-	dst = ip6_route_output(&init_net, NULL, &fl);
+	dst = ip6_route_output(net, NULL, &fl);
 	if (!dst)
 		goto out_free;
 
@@ -1538,9 +1558,10 @@ out_free:
 
 static int ip6mr_find_vif(struct net_device *dev)
 {
+	struct net *net = dev_net(dev);
 	int ct;
-	for (ct = init_net.ipv6.maxvif - 1; ct >= 0; ct--) {
-		if (init_net.ipv6.vif6_table[ct].dev == dev)
+	for (ct = net->ipv6.maxvif - 1; ct >= 0; ct--) {
+		if (net->ipv6.vif6_table[ct].dev == dev)
 			break;
 	}
 	return ct;
@@ -1550,6 +1571,7 @@ static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache)
 {
 	int psend = -1;
 	int vif, ct;
+	struct net *net = mfc6_net(cache);
 
 	vif = cache->mf6c_parent;
 	cache->mfc_un.res.pkt++;
@@ -1558,30 +1580,30 @@ static int ip6_mr_forward(struct sk_buff *skb, struct mfc6_cache *cache)
 	/*
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
-	if (init_net.ipv6.vif6_table[vif].dev != skb->dev) {
+	if (net->ipv6.vif6_table[vif].dev != skb->dev) {
 		int true_vifi;
 
 		cache->mfc_un.res.wrong_if++;
 		true_vifi = ip6mr_find_vif(skb->dev);
 
-		if (true_vifi >= 0 && init_net.ipv6.mroute_do_assert &&
+		if (true_vifi >= 0 && net->ipv6.mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
 		       so that we cannot check that packet arrived on an oif.
 		       It is bad, but otherwise we would need to move pretty
 		       large chunk of pimd to kernel. Ough... --ANK
 		     */
-		    (init_net.ipv6.mroute_do_pim ||
+		    (net->ipv6.mroute_do_pim ||
 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
 		    time_after(jiffies,
 			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
 			cache->mfc_un.res.last_assert = jiffies;
-			ip6mr_cache_report(skb, true_vifi, MRT6MSG_WRONGMIF);
+			ip6mr_cache_report(net, skb, true_vifi, MRT6MSG_WRONGMIF);
 		}
 		goto dont_forward;
 	}
 
-	init_net.ipv6.vif6_table[vif].pkt_in++;
-	init_net.ipv6.vif6_table[vif].bytes_in += skb->len;
+	net->ipv6.vif6_table[vif].pkt_in++;
+	net->ipv6.vif6_table[vif].bytes_in += skb->len;
 
 	/*
 	 *	Forward the frame
@@ -1614,9 +1636,11 @@ dont_forward:
 int ip6_mr_input(struct sk_buff *skb)
 {
 	struct mfc6_cache *cache;
+	struct net *net = dev_net(skb->dev);
 
 	read_lock(&mrt_lock);
-	cache = ip6mr_cache_find(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+	cache = ip6mr_cache_find(net,
+				 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
 
 	/*
 	 *	No usable cache entry
@@ -1626,7 +1650,7 @@ int ip6_mr_input(struct sk_buff *skb)
 
 		vif = ip6mr_find_vif(skb->dev);
 		if (vif >= 0) {
-			int err = ip6mr_cache_unresolved(vif, skb);
+			int err = ip6mr_cache_unresolved(net, vif, skb);
 			read_unlock(&mrt_lock);
 
 			return err;
@@ -1649,7 +1673,8 @@ ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm)
 {
 	int ct;
 	struct rtnexthop *nhp;
-	struct net_device *dev = init_net.ipv6.vif6_table[c->mf6c_parent].dev;
+	struct net *net = mfc6_net(c);
+	struct net_device *dev = net->ipv6.vif6_table[c->mf6c_parent].dev;
 	u8 *b = skb_tail_pointer(skb);
 	struct rtattr *mp_head;
 
@@ -1665,7 +1690,7 @@ ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm)
 			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
 			nhp->rtnh_flags = 0;
 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
-			nhp->rtnh_ifindex = init_net.ipv6.vif6_table[ct].dev->ifindex;
+			nhp->rtnh_ifindex = net->ipv6.vif6_table[ct].dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
 	}
@@ -1679,14 +1704,15 @@ rtattr_failure:
 	return -EMSGSIZE;
 }
 
-int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+int ip6mr_get_route(struct net *net,
+		    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
 {
 	int err;
 	struct mfc6_cache *cache;
 	struct rt6_info *rt = (struct rt6_info *)skb->dst;
 
 	read_lock(&mrt_lock);
-	cache = ip6mr_cache_find(&rt->rt6i_src.addr, &rt->rt6i_dst.addr);
+	cache = ip6mr_cache_find(net, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
 
 	if (!cache) {
 		struct sk_buff *skb2;
@@ -1729,7 +1755,7 @@ int ip6mr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
 		ipv6_addr_copy(&iph->saddr, &rt->rt6i_src.addr);
 		ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
 
-		err = ip6mr_cache_unresolved(vif, skb2);
+		err = ip6mr_cache_unresolved(net, vif, skb2);
 		read_unlock(&mrt_lock);
 
 		return err;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9da1ece466a..18c486cf498 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2194,7 +2194,7 @@ static int rt6_fill_node(struct net *net,
 	if (iif) {
 #ifdef CONFIG_IPV6_MROUTE
 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
-			int err = ip6mr_get_route(skb, rtm, nowait);
+			int err = ip6mr_get_route(net, skb, rtm, nowait);
 			if (err <= 0) {
 				if (!nowait) {
 					if (err == 0)
-- 
cgit v1.2.3-70-g09d2


From 4d4be482a4d78ca906f45e99fd9fdb91e907f5ad Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 9 Dec 2008 04:47:33 -0500
Subject: [XFS] add a FMODE flag to make XFS invisible I/O less hacky

XFS has a mode called invisble I/O that doesn't update any of the
timestamps.  It's used for HSM-style applications and exposed through
the nasty open by handle ioctl.

Instead of doing directly assignment of file operations that set an
internal flag for it add a new FMODE_NOCMTIME flag that we can check
in the normal file operations.

(addition of the generic VFS flag has been ACKed by Al as an interims
 solution)

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c    | 145 ++++++-----------------------------------
 fs/xfs/linux-2.6/xfs_ioctl.c   |  29 ++++++---
 fs/xfs/linux-2.6/xfs_ioctl.h   |   8 +--
 fs/xfs/linux-2.6/xfs_ioctl32.c |  56 ++++++----------
 fs/xfs/linux-2.6/xfs_iops.h    |   1 -
 fs/xfs/xfs_vnodeops.h          |   2 -
 include/linux/fs.h             |   8 +++
 7 files changed, 71 insertions(+), 178 deletions(-)

(limited to 'include/linux')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f999d20a429..a0c45cc8a6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -45,80 +45,44 @@
 
 static struct vm_operations_struct xfs_file_vm_ops;
 
-STATIC_INLINE ssize_t
-__xfs_file_read(
+STATIC ssize_t
+xfs_file_aio_read(
 	struct kiocb		*iocb,
 	const struct iovec	*iov,
 	unsigned long		nr_segs,
-	int			ioflags,
 	loff_t			pos)
 {
 	struct file		*file = iocb->ki_filp;
+	int			ioflags = IO_ISAIO;
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
+	if (file->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
 	return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
 				nr_segs, &iocb->ki_pos, ioflags);
 }
 
 STATIC ssize_t
-xfs_file_aio_read(
-	struct kiocb		*iocb,
-	const struct iovec	*iov,
-	unsigned long		nr_segs,
-	loff_t			pos)
-{
-	return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
-}
-
-STATIC ssize_t
-xfs_file_aio_read_invis(
-	struct kiocb		*iocb,
-	const struct iovec	*iov,
-	unsigned long		nr_segs,
-	loff_t			pos)
-{
-	return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
-}
-
-STATIC_INLINE ssize_t
-__xfs_file_write(
+xfs_file_aio_write(
 	struct kiocb		*iocb,
 	const struct iovec	*iov,
 	unsigned long		nr_segs,
-	int			ioflags,
 	loff_t			pos)
 {
-	struct file	*file = iocb->ki_filp;
+	struct file		*file = iocb->ki_filp;
+	int			ioflags = IO_ISAIO;
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
+	if (file->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
 	return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
 				&iocb->ki_pos, ioflags);
 }
 
-STATIC ssize_t
-xfs_file_aio_write(
-	struct kiocb		*iocb,
-	const struct iovec	*iov,
-	unsigned long		nr_segs,
-	loff_t			pos)
-{
-	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
-}
-
-STATIC ssize_t
-xfs_file_aio_write_invis(
-	struct kiocb		*iocb,
-	const struct iovec	*iov,
-	unsigned long		nr_segs,
-	loff_t			pos)
-{
-	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
-}
-
 STATIC ssize_t
 xfs_file_splice_read(
 	struct file		*infilp,
@@ -127,20 +91,13 @@ xfs_file_splice_read(
 	size_t			len,
 	unsigned int		flags)
 {
-	return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
-				   infilp, ppos, pipe, len, flags, 0);
-}
+	int			ioflags = 0;
+
+	if (infilp->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
 
-STATIC ssize_t
-xfs_file_splice_read_invis(
-	struct file		*infilp,
-	loff_t			*ppos,
-	struct pipe_inode_info	*pipe,
-	size_t			len,
-	unsigned int		flags)
-{
 	return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
-				   infilp, ppos, pipe, len, flags, IO_INVIS);
+				   infilp, ppos, pipe, len, flags, ioflags);
 }
 
 STATIC ssize_t
@@ -151,20 +108,13 @@ xfs_file_splice_write(
 	size_t			len,
 	unsigned int		flags)
 {
-	return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
-				    pipe, outfilp, ppos, len, flags, 0);
-}
+	int			ioflags = 0;
+
+	if (outfilp->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
 
-STATIC ssize_t
-xfs_file_splice_write_invis(
-	struct pipe_inode_info	*pipe,
-	struct file		*outfilp,
-	loff_t			*ppos,
-	size_t			len,
-	unsigned int		flags)
-{
 	return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
-				    pipe, outfilp, ppos, len, flags, IO_INVIS);
+				    pipe, outfilp, ppos, len, flags, ioflags);
 }
 
 STATIC int
@@ -275,42 +225,6 @@ xfs_file_mmap(
 	return 0;
 }
 
-STATIC long
-xfs_file_ioctl(
-	struct file	*filp,
-	unsigned int	cmd,
-	unsigned long	p)
-{
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-
-
-	/* NOTE:  some of the ioctl's return positive #'s as a
-	 *	  byte count indicating success, such as
-	 *	  readlink_by_handle.  So we don't "sign flip"
-	 *	  like most other routines.  This means true
-	 *	  errors need to be returned as a negative value.
-	 */
-	return xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
-}
-
-STATIC long
-xfs_file_ioctl_invis(
-	struct file	*filp,
-	unsigned int	cmd,
-	unsigned long	p)
-{
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-
-
-	/* NOTE:  some of the ioctl's return positive #'s as a
-	 *	  byte count indicating success, such as
-	 *	  readlink_by_handle.  So we don't "sign flip"
-	 *	  like most other routines.  This means true
-	 *	  errors need to be returned as a negative value.
-	 */
-	return xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
-}
-
 /*
  * mmap()d file has taken write protection fault and is being made
  * writable. We can set the page state up correctly for a writable
@@ -346,25 +260,6 @@ const struct file_operations xfs_file_operations = {
 #endif
 };
 
-const struct file_operations xfs_invis_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.aio_read	= xfs_file_aio_read_invis,
-	.aio_write	= xfs_file_aio_write_invis,
-	.splice_read	= xfs_file_splice_read_invis,
-	.splice_write	= xfs_file_splice_write_invis,
-	.unlocked_ioctl	= xfs_file_ioctl_invis,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= xfs_file_compat_invis_ioctl,
-#endif
-	.mmap		= xfs_file_mmap,
-	.open		= xfs_file_open,
-	.release	= xfs_file_release,
-	.fsync		= xfs_file_fsync,
-};
-
-
 const struct file_operations xfs_dir_file_operations = {
 	.open		= xfs_dir_open,
 	.read		= generic_read_dir,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index c8f1e632ba9..0264c8719ff 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -319,10 +319,11 @@ xfs_open_by_handle(
 		put_unused_fd(new_fd);
 		return -XFS_ERROR(-PTR_ERR(filp));
 	}
+
 	if (inode->i_mode & S_IFREG) {
 		/* invisible operation should not change atime */
 		filp->f_flags |= O_NOATIME;
-		filp->f_op = &xfs_invis_file_operations;
+		filp->f_mode |= FMODE_NOCMTIME;
 	}
 
 	fd_install(new_fd, filp);
@@ -1328,21 +1329,31 @@ xfs_ioc_getbmapx(
 	return 0;
 }
 
-int
-xfs_ioctl(
-	xfs_inode_t		*ip,
+/*
+ * Note: some of the ioctl's return positive numbers as a
+ * byte count indicating success, such as readlink_by_handle.
+ * So we don't "sign flip" like most other routines.  This means
+ * true errors need to be returned as a negative value.
+ */
+long
+xfs_file_ioctl(
 	struct file		*filp,
-	int			ioflags,
 	unsigned int		cmd,
-	void			__user *arg)
+	unsigned long		p)
 {
 	struct inode		*inode = filp->f_path.dentry->d_inode;
-	xfs_mount_t		*mp = ip->i_mount;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	void			__user *arg = (void __user *)p;
+	int			ioflags = 0;
 	int			error;
 
-	xfs_itrace_entry(XFS_I(inode));
-	switch (cmd) {
+	if (filp->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
 
+	xfs_itrace_entry(ip);
+
+	switch (cmd) {
 	case XFS_IOC_ALLOCSP:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index d92131c827b..8c16bf2d7e0 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -68,13 +68,13 @@ xfs_attrmulti_attr_remove(
 	__uint32_t		flags);
 
 extern long
-xfs_file_compat_ioctl(
-	struct file		*file,
+xfs_file_ioctl(
+	struct file		*filp,
 	unsigned int		cmd,
-	unsigned long		arg);
+	unsigned long		p);
 
 extern long
-xfs_file_compat_invis_ioctl(
+xfs_file_compat_ioctl(
 	struct file		*file,
 	unsigned int		cmd,
 	unsigned long		arg);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b34b3d8892a..0504cece9f6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -599,19 +599,24 @@ out:
 	return error;
 }
 
-STATIC long
-xfs_compat_ioctl(
-	xfs_inode_t	*ip,
-	struct file	*filp,
-	int		ioflags,
-	unsigned	cmd,
-	void		__user *arg)
+long
+xfs_file_compat_ioctl(
+	struct file		*filp,
+	unsigned		cmd,
+	unsigned long		p)
 {
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-	xfs_mount_t	*mp = ip->i_mount;
-	int		error;
+	struct inode		*inode = filp->f_path.dentry->d_inode;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	void			__user *arg = (void __user *)p;
+	int			ioflags = 0;
+	int			error;
+
+	if (filp->f_mode & FMODE_NOCMTIME)
+		ioflags |= IO_INVIS;
+
+	xfs_itrace_entry(ip);
 
-	xfs_itrace_entry(XFS_I(inode));
 	switch (cmd) {
 	/* No size or alignment issues on any arch */
 	case XFS_IOC_DIOINFO:
@@ -632,7 +637,7 @@ xfs_compat_ioctl(
 	case XFS_IOC_GOINGDOWN:
 	case XFS_IOC_ERROR_INJECTION:
 	case XFS_IOC_ERROR_CLEARALL:
-		return xfs_ioctl(ip, filp, ioflags, cmd, arg);
+		return xfs_file_ioctl(filp, cmd, p);
 #ifndef BROKEN_X86_ALIGNMENT
 	/* These are handled fine if no alignment issues */
 	case XFS_IOC_ALLOCSP:
@@ -646,7 +651,7 @@ xfs_compat_ioctl(
 	case XFS_IOC_FSGEOMETRY_V1:
 	case XFS_IOC_FSGROWFSDATA:
 	case XFS_IOC_FSGROWFSRT:
-		return xfs_ioctl(ip, filp, ioflags, cmd, arg);
+		return xfs_file_ioctl(filp, cmd, p);
 #else
 	case XFS_IOC_ALLOCSP_32:
 	case XFS_IOC_FREESP_32:
@@ -687,7 +692,7 @@ xfs_compat_ioctl(
 	case XFS_IOC_SETXFLAGS_32:
 	case XFS_IOC_GETVERSION_32:
 		cmd = _NATIVE_IOC(cmd, long);
-		return xfs_ioctl(ip, filp, ioflags, cmd, arg);
+		return xfs_file_ioctl(filp, cmd, p);
 	case XFS_IOC_SWAPEXT: {
 		struct xfs_swapext	  sxp;
 		struct compat_xfs_swapext __user *sxu = arg;
@@ -738,26 +743,3 @@ xfs_compat_ioctl(
 		return -XFS_ERROR(ENOIOCTLCMD);
 	}
 }
-
-long
-xfs_file_compat_ioctl(
-	struct file		*filp,
-	unsigned int		cmd,
-	unsigned long		p)
-{
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-
-	return xfs_compat_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
-}
-
-long
-xfs_file_compat_invis_ioctl(
-	struct file		*filp,
-	unsigned int		cmd,
-	unsigned long		p)
-{
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-
-	return xfs_compat_ioctl(XFS_I(inode), filp, IO_INVIS, cmd,
-				(void __user *)p);
-}
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 8b1a1e31dc2..ef41c92ce66 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,7 +22,6 @@ struct xfs_inode;
 
 extern const struct file_operations xfs_file_operations;
 extern const struct file_operations xfs_dir_file_operations;
-extern const struct file_operations xfs_invis_file_operations;
 
 extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
 
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 2a45b00ad32..55d955ec4ec 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -53,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
 int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
-		int ioflags, unsigned int cmd, void __user *arg);
 ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
 		const struct iovec *iovp, unsigned int segs,
 		loff_t *offset, int ioflags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 51bd9370d43..965b9ba3865 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -81,6 +81,14 @@ extern int dir_notify_enable;
 #define FMODE_WRITE_IOCTL	((__force fmode_t)128)
 #define FMODE_NDELAY_NOW	((__force fmode_t)256)
 
+/*
+ * Don't update ctime and mtime.
+ *
+ * Currently a special hack for the XFS open_by_handle ioctl, but we'll
+ * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
+ */
+#define FMODE_NOCMTIME		((__force fmode_t)2048)
+
 #define RW_MASK		1
 #define RWA_MASK	2
 #define READ 0
-- 
cgit v1.2.3-70-g09d2


From c2724775ce57c98b8af9694857b941dc61056516 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Thu, 11 Dec 2008 13:49:59 +0100
Subject: x86, bts: provide in-kernel branch-trace interface

Impact: cleanup

Move the BTS bits from ptrace.c into ds.c.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h          | 241 ++++++-----
 arch/x86/include/asm/processor.h   |  13 +
 arch/x86/include/asm/ptrace.h      |  36 --
 arch/x86/include/asm/thread_info.h |   5 +-
 arch/x86/kernel/cpu/intel.c        |   4 -
 arch/x86/kernel/ds.c               | 857 +++++++++++++++++++++++--------------
 arch/x86/kernel/process_32.c       |  59 +--
 arch/x86/kernel/process_64.c       |  50 +--
 arch/x86/kernel/ptrace.c           | 416 +++++-------------
 include/linux/sched.h              |   1 +
 10 files changed, 811 insertions(+), 871 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 99b6c39774a..ee0ea3a96c1 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -6,13 +6,13 @@
  * precise-event based sampling (PEBS).
  *
  * It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
+ * - DS and BTS hardware configuration
  * - buffer overflow handling (to be done)
  * - buffer access
  *
- * It assumes:
- * - get_task_struct on all traced tasks
- * - current is allowed to trace tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -31,6 +31,7 @@
 #ifdef CONFIG_X86_DS
 
 struct task_struct;
+struct ds_context;
 struct ds_tracer;
 struct bts_tracer;
 struct pebs_tracer;
@@ -38,6 +39,38 @@ struct pebs_tracer;
 typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
 typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
 
+
+/*
+ * A list of features plus corresponding macros to talk about them in
+ * the ds_request function's flags parameter.
+ *
+ * We use the enum to index an array of corresponding control bits;
+ * we use the macro to index a flags bit-vector.
+ */
+enum ds_feature {
+	dsf_bts = 0,
+	dsf_bts_kernel,
+#define BTS_KERNEL (1 << dsf_bts_kernel)
+	/* trace kernel-mode branches */
+
+	dsf_bts_user,
+#define BTS_USER (1 << dsf_bts_user)
+	/* trace user-mode branches */
+
+	dsf_bts_overflow,
+	dsf_bts_max,
+	dsf_pebs = dsf_bts_max,
+
+	dsf_pebs_max,
+	dsf_ctl_max = dsf_pebs_max,
+	dsf_bts_timestamps = dsf_ctl_max,
+#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
+	/* add timestamps into BTS trace */
+
+#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
+};
+
+
 /*
  * Request BTS or PEBS
  *
@@ -58,92 +91,135 @@ typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
  *       NULL if cyclic buffer requested
  * th: the interrupt threshold in records from the end of the buffer;
  *     -1 if no interrupt threshold is requested.
+ * flags: a bit-mask of the above flags
  */
 extern struct bts_tracer *ds_request_bts(struct task_struct *task,
 					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl, size_t th);
+					 bts_ovfl_callback_t ovfl,
+					 size_t th, unsigned int flags);
 extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 					   void *base, size_t size,
 					   pebs_ovfl_callback_t ovfl,
-					   size_t th);
+					   size_t th, unsigned int flags);
 
 /*
  * Release BTS or PEBS resources
- *
- * Returns 0 on success; -Eerrno otherwise
+ * Suspend and resume BTS or PEBS tracing
  *
  * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_release_bts(struct bts_tracer *tracer);
-extern int ds_release_pebs(struct pebs_tracer *tracer);
+extern void ds_release_bts(struct bts_tracer *tracer);
+extern void ds_suspend_bts(struct bts_tracer *tracer);
+extern void ds_resume_bts(struct bts_tracer *tracer);
+extern void ds_release_pebs(struct pebs_tracer *tracer);
+extern void ds_suspend_pebs(struct pebs_tracer *tracer);
+extern void ds_resume_pebs(struct pebs_tracer *tracer);
+
 
 /*
- * Get the (array) index of the write pointer.
- * (assuming an array of BTS/PEBS records)
- *
- * Returns 0 on success; -Eerrno on error
+ * The raw DS buffer state as it is used for BTS and PEBS recording.
  *
- * tracer: the tracer handle returned from ds_request_~()
- * pos (out): will hold the result
+ * This is the low-level, arch-dependent interface for working
+ * directly on the raw trace data.
  */
-extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos);
-extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);
+struct ds_trace {
+	/* the number of bts/pebs records */
+	size_t n;
+	/* the size of a bts/pebs record in bytes */
+	size_t size;
+	/* pointers into the raw buffer:
+	   - to the first entry */
+	void *begin;
+	/* - one beyond the last entry */
+	void *end;
+	/* - one beyond the newest entry */
+	void *top;
+	/* - the interrupt threshold */
+	void *ith;
+	/* flags given on ds_request() */
+	unsigned int flags;
+};
 
 /*
- * Get the (array) index one record beyond the end of the array.
- * (assuming an array of BTS/PEBS records)
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- * pos (out): will hold the result
+ * An arch-independent view on branch trace data.
  */
-extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos);
-extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);
+enum bts_qualifier {
+	bts_invalid,
+#define BTS_INVALID bts_invalid
+
+	bts_branch,
+#define BTS_BRANCH bts_branch
+
+	bts_task_arrives,
+#define BTS_TASK_ARRIVES bts_task_arrives
+
+	bts_task_departs,
+#define BTS_TASK_DEPARTS bts_task_departs
+
+	bts_qual_bit_size = 4,
+	bts_qual_max = (1 << bts_qual_bit_size),
+};
+
+struct bts_struct {
+	__u64 qualifier;
+	union {
+		/* BTS_BRANCH */
+		struct {
+			__u64 from;
+			__u64 to;
+		} lbr;
+		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
+		struct {
+			__u64 jiffies;
+			pid_t pid;
+		} timestamp;
+	} variant;
+};
+
 
 /*
- * Provide a pointer to the BTS/PEBS record at parameter index.
- * (assuming an array of BTS/PEBS records)
- *
- * The pointer points directly into the buffer. The user is
- * responsible for copying the record.
- *
- * Returns the size of a single record on success; -Eerrno on error
+ * The BTS state.
  *
- * tracer: the tracer handle returned from ds_request_~()
- * index: the index of the requested record
- * record (out): pointer to the requested record
+ * This gives access to the raw DS state and adds functions to provide
+ * an arch-independent view of the BTS data.
  */
-extern int ds_access_bts(struct bts_tracer *tracer,
-			 size_t index, const void **record);
-extern int ds_access_pebs(struct pebs_tracer *tracer,
-			  size_t index, const void **record);
+struct bts_trace {
+	struct ds_trace ds;
+
+	int (*read)(struct bts_tracer *tracer, const void *at,
+		    struct bts_struct *out);
+	int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
+};
+
 
 /*
- * Write one or more BTS/PEBS records at the write pointer index and
- * advance the write pointer.
+ * The PEBS state.
  *
- * If size is not a multiple of the record size, trailing bytes are
- * zeroed out.
- *
- * May result in one or more overflow notifications.
- *
- * If called during overflow handling, that is, with index >=
- * interrupt threshold, the write will wrap around.
+ * This gives access to the raw DS state and the PEBS-specific counter
+ * reset value.
+ */
+struct pebs_trace {
+	struct ds_trace ds;
+
+	/* the PEBS reset value */
+	unsigned long long reset_value;
+};
+
+
+/*
+ * Read the BTS or PEBS trace.
  *
- * An overflow notification is given if and when the interrupt
- * threshold is reached during or after the write.
+ * Returns a view on the trace collected for the parameter tracer.
  *
- * Returns the number of bytes written or -Eerrno.
+ * The view remains valid as long as the traced task is not running or
+ * the tracer is suspended.
+ * Writes into the trace buffer are not reflected.
  *
  * tracer: the tracer handle returned from ds_request_~()
- * buffer: the buffer to write
- * size: the size of the buffer
  */
-extern int ds_write_bts(struct bts_tracer *tracer,
-			const void *buffer, size_t size);
-extern int ds_write_pebs(struct pebs_tracer *tracer,
-			 const void *buffer, size_t size);
+extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
+extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
+
 
 /*
  * Reset the write pointer of the BTS/PEBS buffer.
@@ -155,27 +231,6 @@ extern int ds_write_pebs(struct pebs_tracer *tracer,
 extern int ds_reset_bts(struct bts_tracer *tracer);
 extern int ds_reset_pebs(struct pebs_tracer *tracer);
 
-/*
- * Clear the BTS/PEBS buffer and reset the write pointer.
- * The entire buffer will be zeroed out.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_clear_bts(struct bts_tracer *tracer);
-extern int ds_clear_pebs(struct pebs_tracer *tracer);
-
-/*
- * Provide the PEBS counter reset value.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_pebs()
- * value (out): the counter reset value
- */
-extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);
-
 /*
  * Set the PEBS counter reset value.
  *
@@ -192,35 +247,17 @@ extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
 struct cpuinfo_x86;
 extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
 
-
-
 /*
- * The DS context - part of struct thread_struct.
+ * Context switch work
  */
-#define MAX_SIZEOF_DS (12 * 8)
-
-struct ds_context {
-	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-	unsigned char ds[MAX_SIZEOF_DS];
-	/* the owner of the BTS and PEBS configuration, respectively */
-	struct ds_tracer  *owner[2];
-	/* use count */
-	unsigned long count;
-	/* a pointer to the context location inside the thread_struct
-	 * or the per_cpu context array */
-	struct ds_context **this;
-	/* a pointer to the task owning this context, or NULL, if the
-	 * context is owned by a cpu */
-	struct task_struct *task;
-};
-
-/* called by exit_thread() to free leftover contexts */
-extern void ds_free(struct ds_context *context);
+extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
 #else /* CONFIG_X86_DS */
 
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
+static inline void ds_switch_to(struct task_struct *prev,
+				struct task_struct *next) {}
 
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5ca01e38326..aa5914f8e50 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -752,6 +752,19 @@ extern void switch_to_new_gdt(void);
 extern void cpu_init(void);
 extern void init_gdt(int cpu);
 
+static inline unsigned long get_debugctlmsr(void)
+{
+    unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return 0;
+#endif
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+    return debugctlmsr;
+}
+
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index eefb0594b05..fbf74421591 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -6,7 +6,6 @@
 #include <asm/processor-flags.h>
 
 #ifdef __KERNEL__
-#include <asm/ds.h>		/* the DS BTS struct is used for ptrace too */
 #include <asm/segment.h>
 #endif
 
@@ -128,34 +127,6 @@ struct pt_regs {
 #endif /* !__i386__ */
 
 
-#ifdef CONFIG_X86_PTRACE_BTS
-/* a branch trace record entry
- *
- * In order to unify the interface between various processor versions,
- * we use the below data structure for all processors.
- */
-enum bts_qualifier {
-	BTS_INVALID = 0,
-	BTS_BRANCH,
-	BTS_TASK_ARRIVES,
-	BTS_TASK_DEPARTS
-};
-
-struct bts_struct {
-	__u64 qualifier;
-	union {
-		/* BTS_BRANCH */
-		struct {
-			__u64 from_ip;
-			__u64 to_ip;
-		} lbr;
-		/* BTS_TASK_ARRIVES or
-		   BTS_TASK_DEPARTS */
-		__u64 jiffies;
-	} variant;
-};
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 #ifdef __KERNEL__
 
 #include <linux/init.h>
@@ -163,13 +134,6 @@ struct bts_struct {
 struct cpuinfo_x86;
 struct task_struct;
 
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
-extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
-#else
-#define ptrace_bts_init_intel(config) do {} while (0)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 extern unsigned long profile_pc(struct pt_regs *regs);
 
 extern unsigned long
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0921b4018c1..bf8113d16a3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -93,7 +93,6 @@ struct thread_info {
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
-#define TIF_BTS_TRACE_TS	27      /* record scheduling event timestamps */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -115,7 +114,6 @@ struct thread_info {
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
-#define _TIF_BTS_TRACE_TS	(1 << TIF_BTS_TRACE_TS)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -141,8 +139,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
-								_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816f27f289b..cd413d9a021 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
 #include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/uaccess.h>
-#include <asm/ptrace.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
 
@@ -309,9 +308,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_P3);
 #endif
 
-	if (cpu_has_bts)
-		ptrace_bts_init_intel(c);
-
 	detect_extended_topology(c);
 	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
 		/*
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 09530698866..f0583005b75 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,13 +6,13 @@
  * precise-event based sampling (PEBS).
  *
  * It manages:
- * - per-thread and per-cpu allocation of BTS and PEBS
+ * - DS and BTS hardware configuration
  * - buffer overflow handling (to be done)
  * - buffer access
  *
- * It assumes:
- * - get_task_struct on all traced tasks
- * - current is allowed to trace tasks
+ * It does not do:
+ * - security checking (is the caller allowed to trace the task)
+ * - buffer allocation (memory accounting)
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -34,15 +34,30 @@
  * The configuration for a particular DS hardware implementation.
  */
 struct ds_configuration {
-	/* the size of the DS structure in bytes */
-	unsigned char  sizeof_ds;
-	/* the size of one pointer-typed field in the DS structure in bytes;
-	   this covers the first 8 fields related to buffer management. */
+	/* the name of the configuration */
+	const char *name;
+	/* the size of one pointer-typed field in the DS structure and
+	   in the BTS and PEBS buffers in bytes;
+	   this covers the first 8 DS fields related to buffer management. */
 	unsigned char  sizeof_field;
 	/* the size of a BTS/PEBS record in bytes */
 	unsigned char  sizeof_rec[2];
+	/* a series of bit-masks to control various features indexed
+	 * by enum ds_feature */
+	unsigned long ctl[dsf_ctl_max];
 };
-static struct ds_configuration ds_cfg;
+static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+
+#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+
+#define MAX_SIZEOF_DS (12 * 8)	/* maximal size of a DS configuration */
+#define MAX_SIZEOF_BTS (3 * 8)	/* maximal size of a BTS record */
+#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+
+#define BTS_CONTROL \
+ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
+  ds_cfg.ctl[dsf_bts_overflow])
+
 
 /*
  * A BTS or PEBS tracer.
@@ -61,6 +76,8 @@ struct ds_tracer {
 struct bts_tracer {
 	/* the common DS part */
 	struct ds_tracer ds;
+	/* the trace including the DS configuration */
+	struct bts_trace trace;
 	/* buffer overflow notification function */
 	bts_ovfl_callback_t ovfl;
 };
@@ -68,6 +85,8 @@ struct bts_tracer {
 struct pebs_tracer {
 	/* the common DS part */
 	struct ds_tracer ds;
+	/* the trace including the DS configuration */
+	struct pebs_trace trace;
 	/* buffer overflow notification function */
 	pebs_ovfl_callback_t ovfl;
 };
@@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
 	(*(unsigned long *)base) = value;
 }
 
-#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
-
 
 /*
  * Locking is done only for allocating BTS or PEBS resources.
  */
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
+static DEFINE_SPINLOCK(ds_lock);
 
 
 /*
@@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
  *   >0  number of per-thread tracers
  *   <0  number of per-cpu tracers
  *
- * The below functions to get and put tracers and to check the
- * allocation type require the ds_lock to be held by the caller.
- *
  * Tracers essentially gives the number of ds contexts for a certain
  * type of allocation.
  */
-static long tracers;
+static atomic_t tracers = ATOMIC_INIT(0);
 
 static inline void get_tracer(struct task_struct *task)
 {
-	tracers += (task ? 1 : -1);
+	if (task)
+		atomic_inc(&tracers);
+	else
+		atomic_dec(&tracers);
 }
 
 static inline void put_tracer(struct task_struct *task)
 {
-	tracers -= (task ? 1 : -1);
+	if (task)
+		atomic_dec(&tracers);
+	else
+		atomic_inc(&tracers);
 }
 
 static inline int check_tracer(struct task_struct *task)
 {
-	return (task ? (tracers >= 0) : (tracers <= 0));
+	return task ?
+		(atomic_read(&tracers) >= 0) :
+		(atomic_read(&tracers) <= 0);
 }
 
 
@@ -190,14 +212,30 @@ static inline int check_tracer(struct task_struct *task)
  * Contexts are use-counted. They are allocated on first access and
  * deallocated when the last user puts the context.
  */
-static DEFINE_PER_CPU(struct ds_context *, system_context);
+struct ds_context {
+	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
+	unsigned char ds[MAX_SIZEOF_DS];
+	/* the owner of the BTS and PEBS configuration, respectively */
+	struct bts_tracer *bts_master;
+	struct pebs_tracer *pebs_master;
+	/* use count */
+	unsigned long count;
+	/* a pointer to the context location inside the thread_struct
+	 * or the per_cpu context array */
+	struct ds_context **this;
+	/* a pointer to the task owning this context, or NULL, if the
+	 * context is owned by a cpu */
+	struct task_struct *task;
+};
+
+static DEFINE_PER_CPU(struct ds_context *, system_context_array);
 
-#define this_system_context per_cpu(system_context, smp_processor_id())
+#define system_context per_cpu(system_context_array, smp_processor_id())
 
 static inline struct ds_context *ds_get_context(struct task_struct *task)
 {
 	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &this_system_context);
+		(task ? &task->thread.ds_ctx : &system_context);
 	struct ds_context *context = *p_context;
 	unsigned long irq;
 
@@ -225,10 +263,22 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 				wrmsrl(MSR_IA32_DS_AREA,
 				       (unsigned long)context->ds);
 		}
+
+		context->count++;
+
+		spin_unlock_irqrestore(&ds_lock, irq);
+	} else {
+		spin_lock_irqsave(&ds_lock, irq);
+
+		context = *p_context;
+		if (context)
+			context->count++;
+
 		spin_unlock_irqrestore(&ds_lock, irq);
-	}
 
-	context->count++;
+		if (!context)
+			context = ds_get_context(task);
+	}
 
 	return context;
 }
@@ -242,8 +292,10 @@ static inline void ds_put_context(struct ds_context *context)
 
 	spin_lock_irqsave(&ds_lock, irq);
 
-	if (--context->count)
-		goto out;
+	if (--context->count) {
+		spin_unlock_irqrestore(&ds_lock, irq);
+		return;
+	}
 
 	*(context->this) = NULL;
 
@@ -253,14 +305,14 @@ static inline void ds_put_context(struct ds_context *context)
 	if (!context->task || (context->task == current))
 		wrmsrl(MSR_IA32_DS_AREA, 0);
 
-	kfree(context);
- out:
 	spin_unlock_irqrestore(&ds_lock, irq);
+
+	kfree(context);
 }
 
 
 /*
- * Handle a buffer overflow
+ * Call the tracer's callback on a buffer overflow.
  *
  * context: the ds context
  * qual: the buffer type
@@ -268,30 +320,244 @@ static inline void ds_put_context(struct ds_context *context)
 static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
 {
 	switch (qual) {
-	case ds_bts: {
-		struct bts_tracer *tracer =
-			container_of(context->owner[qual],
-				     struct bts_tracer, ds);
-		if (tracer->ovfl)
-			tracer->ovfl(tracer);
-	}
+	case ds_bts:
+		if (context->bts_master &&
+		    context->bts_master->ovfl)
+			context->bts_master->ovfl(context->bts_master);
+		break;
+	case ds_pebs:
+		if (context->pebs_master &&
+		    context->pebs_master->ovfl)
+			context->pebs_master->ovfl(context->pebs_master);
 		break;
-	case ds_pebs: {
-		struct pebs_tracer *tracer =
-			container_of(context->owner[qual],
-				     struct pebs_tracer, ds);
-		if (tracer->ovfl)
-			tracer->ovfl(tracer);
 	}
+}
+
+
+/*
+ * Write raw data into the BTS or PEBS buffer.
+ *
+ * The remainder of any partially written record is zeroed out.
+ *
+ * context: the DS context
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
+ */
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+		    const void *record, size_t size)
+{
+	int bytes_written = 0;
+
+	if (!record)
+		return -EINVAL;
+
+	while (size) {
+		unsigned long base, index, end, write_end, int_th;
+		unsigned long write_size, adj_write_size;
+
+		/*
+		 * write as much as possible without producing an
+		 * overflow interrupt.
+		 *
+		 * interrupt_threshold must either be
+		 * - bigger than absolute_maximum or
+		 * - point to a record between buffer_base and absolute_maximum
+		 *
+		 * index points to a valid record.
+		 */
+		base   = ds_get(context->ds, qual, ds_buffer_base);
+		index  = ds_get(context->ds, qual, ds_index);
+		end    = ds_get(context->ds, qual, ds_absolute_maximum);
+		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+
+		write_end = min(end, int_th);
+
+		/* if we are already beyond the interrupt threshold,
+		 * we fill the entire buffer */
+		if (write_end <= index)
+			write_end = end;
+
+		if (write_end <= index)
+			break;
+
+		write_size = min((unsigned long) size, write_end - index);
+		memcpy((void *)index, record, write_size);
+
+		record = (const char *)record + write_size;
+		size -= write_size;
+		bytes_written += write_size;
+
+		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+		adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+		/* zero out trailing bytes */
+		memset((char *)index + write_size, 0,
+		       adj_write_size - write_size);
+		index += adj_write_size;
+
+		if (index >= end)
+			index = base;
+		ds_set(context->ds, qual, ds_index, index);
+
+		if (index >= int_th)
+			ds_overflow(context, qual);
+	}
+
+	return bytes_written;
+}
+
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ *
+ * We use two levels of abstraction:
+ * - the raw data level defined here
+ * - an arch-independent level defined in ds.h
+ */
+
+enum bts_field {
+	bts_from,
+	bts_to,
+	bts_flags,
+
+	bts_qual = bts_from,
+	bts_jiffies = bts_to,
+	bts_pid = bts_flags,
+
+	bts_qual_mask = (bts_qual_max - 1),
+	bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+	base += (ds_cfg.sizeof_field * field);
+	return *(unsigned long *)base;
+}
+
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+{
+	base += (ds_cfg.sizeof_field * field);;
+	(*(unsigned long *)base) = val;
+}
+
+
+/*
+ * The raw BTS data is architecture dependent.
+ *
+ * For higher-level users, we give an arch-independent view.
+ * - ds.h defines struct bts_struct
+ * - bts_read translates one raw bts record into a bts_struct
+ * - bts_write translates one bts_struct into the raw format and
+ *   writes it into the top of the parameter tracer's buffer.
+ *
+ * return: bytes read/written on success; -Eerrno, otherwise
+ */
+static int bts_read(struct bts_tracer *tracer, const void *at,
+		    struct bts_struct *out)
+{
+	if (!tracer)
+		return -EINVAL;
+
+	if (at < tracer->trace.ds.begin)
+		return -EINVAL;
+
+	if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
+		return -EINVAL;
+
+	memset(out, 0, sizeof(*out));
+	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
+		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
+		out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
+		out->variant.timestamp.pid = bts_get(at, bts_pid);
+	} else {
+		out->qualifier = bts_branch;
+		out->variant.lbr.from = bts_get(at, bts_from);
+		out->variant.lbr.to   = bts_get(at, bts_to);
+	}
+
+	return ds_cfg.sizeof_rec[ds_bts];
+}
+
+static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
+{
+	unsigned char raw[MAX_SIZEOF_BTS];
+
+	if (!tracer)
+		return -EINVAL;
+
+	if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
+		return -EOVERFLOW;
+
+	switch (in->qualifier) {
+	case bts_invalid:
+		bts_set(raw, bts_from, 0);
+		bts_set(raw, bts_to, 0);
+		bts_set(raw, bts_flags, 0);
+		break;
+	case bts_branch:
+		bts_set(raw, bts_from, in->variant.lbr.from);
+		bts_set(raw, bts_to,   in->variant.lbr.to);
+		bts_set(raw, bts_flags, 0);
+		break;
+	case bts_task_arrives:
+	case bts_task_departs:
+		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
+		bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
+		bts_set(raw, bts_pid, in->variant.timestamp.pid);
 		break;
+	default:
+		return -EINVAL;
 	}
+
+	return ds_write(tracer->ds.context, ds_bts, raw,
+			ds_cfg.sizeof_rec[ds_bts]);
 }
 
 
-static void ds_install_ds_config(struct ds_context *context,
-				 enum ds_qualifier qual,
-				 void *base, size_t size, size_t ith)
+static void ds_write_config(struct ds_context *context,
+			    struct ds_trace *cfg, enum ds_qualifier qual)
+{
+	unsigned char *ds = context->ds;
+
+	ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
+	ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
+	ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
+	ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
+}
+
+static void ds_read_config(struct ds_context *context,
+			   struct ds_trace *cfg, enum ds_qualifier qual)
 {
+	unsigned char *ds = context->ds;
+
+	cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
+	cfg->top = (void *)ds_get(ds, qual, ds_index);
+	cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
+	cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
+}
+
+static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
+			     void *base, size_t size, size_t ith,
+			     unsigned int flags) {
 	unsigned long buffer, adj;
 
 	/* adjust the buffer address and size to meet alignment
@@ -308,32 +574,30 @@ static void ds_install_ds_config(struct ds_context *context,
 	buffer += adj;
 	size   -= adj;
 
-	size /= ds_cfg.sizeof_rec[qual];
-	size *= ds_cfg.sizeof_rec[qual];
+	trace->n = size / ds_cfg.sizeof_rec[qual];
+	trace->size = ds_cfg.sizeof_rec[qual];
 
-	ds_set(context->ds, qual, ds_buffer_base, buffer);
-	ds_set(context->ds, qual, ds_index, buffer);
-	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+	size = (trace->n * trace->size);
 
+	trace->begin = (void *)buffer;
+	trace->top = trace->begin;
+	trace->end = (void *)(buffer + size);
 	/* The value for 'no threshold' is -1, which will set the
 	 * threshold outside of the buffer, just like we want it.
 	 */
-	ds_set(context->ds, qual,
-	       ds_interrupt_threshold, buffer + size - ith);
+	trace->ith = (void *)(buffer + size - ith);
+
+	trace->flags = flags;
 }
 
-static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
-		      struct task_struct *task,
-		      void *base, size_t size, size_t th)
+
+static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
+		      enum ds_qualifier qual, struct task_struct *task,
+		      void *base, size_t size, size_t th, unsigned int flags)
 {
 	struct ds_context *context;
-	unsigned long irq;
 	int error;
 
-	error = -EOPNOTSUPP;
-	if (!ds_cfg.sizeof_ds)
-		goto out;
-
 	error = -EINVAL;
 	if (!base)
 		goto out;
@@ -360,43 +624,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
 		goto out;
 	tracer->context = context;
 
+	ds_init_ds_trace(trace, qual, base, size, th, flags);
 
-	spin_lock_irqsave(&ds_lock, irq);
-
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
-
-	error = -EPERM;
-	if (context->owner[qual])
-		goto out_put_tracer;
-	context->owner[qual] = tracer;
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-
-	ds_install_ds_config(context, qual, base, size, th);
-
-	return 0;
-
- out_put_tracer:
-	put_tracer(task);
- out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
-	ds_put_context(context);
-	tracer->context = NULL;
+	error = 0;
  out:
 	return error;
 }
 
 struct bts_tracer *ds_request_bts(struct task_struct *task,
 				  void *base, size_t size,
-				  bts_ovfl_callback_t ovfl, size_t th)
+				  bts_ovfl_callback_t ovfl, size_t th,
+				  unsigned int flags)
 {
 	struct bts_tracer *tracer;
+	unsigned long irq;
 	int error;
 
+	error = -EOPNOTSUPP;
+	if (!ds_cfg.ctl[dsf_bts])
+		goto out;
+
 	/* buffer overflow notification is not yet implemented */
 	error = -EOPNOTSUPP;
 	if (ovfl)
@@ -408,12 +655,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 		goto out;
 	tracer->ovfl = ovfl;
 
-	error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
+	error = ds_request(&tracer->ds, &tracer->trace.ds,
+			   ds_bts, task, base, size, th, flags);
 	if (error < 0)
 		goto out_tracer;
 
+
+	spin_lock_irqsave(&ds_lock, irq);
+
+	error = -EPERM;
+	if (!check_tracer(task))
+		goto out_unlock;
+	get_tracer(task);
+
+	error = -EPERM;
+	if (tracer->ds.context->bts_master)
+		goto out_put_tracer;
+	tracer->ds.context->bts_master = tracer;
+
+	spin_unlock_irqrestore(&ds_lock, irq);
+
+
+	tracer->trace.read  = bts_read;
+	tracer->trace.write = bts_write;
+
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_resume_bts(tracer);
+
 	return tracer;
 
+ out_put_tracer:
+	put_tracer(task);
+ out_unlock:
+	spin_unlock_irqrestore(&ds_lock, irq);
+	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
  out:
@@ -422,9 +697,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 
 struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 				    void *base, size_t size,
-				    pebs_ovfl_callback_t ovfl, size_t th)
+				    pebs_ovfl_callback_t ovfl, size_t th,
+				    unsigned int flags)
 {
 	struct pebs_tracer *tracer;
+	unsigned long irq;
 	int error;
 
 	/* buffer overflow notification is not yet implemented */
@@ -438,300 +715,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 		goto out;
 	tracer->ovfl = ovfl;
 
-	error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
+	error = ds_request(&tracer->ds, &tracer->trace.ds,
+			   ds_pebs, task, base, size, th, flags);
 	if (error < 0)
 		goto out_tracer;
 
+	spin_lock_irqsave(&ds_lock, irq);
+
+	error = -EPERM;
+	if (!check_tracer(task))
+		goto out_unlock;
+	get_tracer(task);
+
+	error = -EPERM;
+	if (tracer->ds.context->pebs_master)
+		goto out_put_tracer;
+	tracer->ds.context->pebs_master = tracer;
+
+	spin_unlock_irqrestore(&ds_lock, irq);
+
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_resume_pebs(tracer);
+
 	return tracer;
 
+ out_put_tracer:
+	put_tracer(task);
+ out_unlock:
+	spin_unlock_irqrestore(&ds_lock, irq);
+	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
  out:
 	return ERR_PTR(error);
 }
 
-static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
-{
-	WARN_ON_ONCE(tracer->context->owner[qual] != tracer);
-	tracer->context->owner[qual] = NULL;
-
-	put_tracer(tracer->context->task);
-	ds_put_context(tracer->context);
-}
-
-int ds_release_bts(struct bts_tracer *tracer)
+void ds_release_bts(struct bts_tracer *tracer)
 {
 	if (!tracer)
-		return -EINVAL;
+		return;
 
-	ds_release(&tracer->ds, ds_bts);
-	kfree(tracer);
+	ds_suspend_bts(tracer);
 
-	return 0;
-}
+	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
+	tracer->ds.context->bts_master = NULL;
 
-int ds_release_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
+	put_tracer(tracer->ds.context->task);
+	ds_put_context(tracer->ds.context);
 
-	ds_release(&tracer->ds, ds_pebs);
 	kfree(tracer);
-
-	return 0;
-}
-
-static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
-{
-	unsigned long base, index;
-
-	base  = ds_get(context->ds, qual, ds_buffer_base);
-	index = ds_get(context->ds, qual, ds_index);
-
-	return (index - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
+void ds_suspend_bts(struct bts_tracer *tracer)
 {
-	if (!tracer)
-		return -EINVAL;
+	struct task_struct *task;
 
-	if (!pos)
-		return -EINVAL;
-
-	*pos = ds_get_index(tracer->ds.context, ds_bts);
-
-	return 0;
-}
-
-int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
-{
 	if (!tracer)
-		return -EINVAL;
+		return;
 
-	if (!pos)
-		return -EINVAL;
+	task = tracer->ds.context->task;
 
-	*pos = ds_get_index(tracer->ds.context, ds_pebs);
+	if (!task || (task == current))
+		update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
 
-	return 0;
-}
+	if (task) {
+		task->thread.debugctlmsr &= ~BTS_CONTROL;
 
-static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
-{
-	unsigned long base, max;
-
-	base = ds_get(context->ds, qual, ds_buffer_base);
-	max  = ds_get(context->ds, qual, ds_absolute_maximum);
-
-	return (max - base) / ds_cfg.sizeof_rec[qual];
+		if (!task->thread.debugctlmsr)
+			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	}
 }
 
-int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
+void ds_resume_bts(struct bts_tracer *tracer)
 {
-	if (!tracer)
-		return -EINVAL;
-
-	if (!pos)
-		return -EINVAL;
-
-	*pos = ds_get_end(tracer->ds.context, ds_bts);
-
-	return 0;
-}
+	struct task_struct *task;
+	unsigned long control;
 
-int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
-{
 	if (!tracer)
-		return -EINVAL;
-
-	if (!pos)
-		return -EINVAL;
-
-	*pos = ds_get_end(tracer->ds.context, ds_pebs);
-
-	return 0;
-}
-
-static int ds_access(struct ds_context *context, enum ds_qualifier qual,
-		     size_t index, const void **record)
-{
-	unsigned long base, idx;
-
-	if (!record)
-		return -EINVAL;
-
-	base = ds_get(context->ds, qual, ds_buffer_base);
-	idx = base + (index * ds_cfg.sizeof_rec[qual]);
-
-	if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
-		return -EINVAL;
+		return;
 
-	*record = (const void *)idx;
+	task = tracer->ds.context->task;
 
-	return ds_cfg.sizeof_rec[qual];
-}
+	control = ds_cfg.ctl[dsf_bts];
+	if (!(tracer->trace.ds.flags & BTS_KERNEL))
+		control |= ds_cfg.ctl[dsf_bts_kernel];
+	if (!(tracer->trace.ds.flags & BTS_USER))
+		control |= ds_cfg.ctl[dsf_bts_user];
 
-int ds_access_bts(struct bts_tracer *tracer, size_t index,
-		  const void **record)
-{
-	if (!tracer)
-		return -EINVAL;
+	if (task) {
+		task->thread.debugctlmsr |= control;
+		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	}
 
-	return ds_access(tracer->ds.context, ds_bts, index, record);
+	if (!task || (task == current))
+		update_debugctlmsr(get_debugctlmsr() | control);
 }
 
-int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
-		   const void **record)
+void ds_release_pebs(struct pebs_tracer *tracer)
 {
 	if (!tracer)
-		return -EINVAL;
-
-	return ds_access(tracer->ds.context, ds_pebs, index, record);
-}
-
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
-		    const void *record, size_t size)
-{
-	int bytes_written = 0;
-
-	if (!record)
-		return -EINVAL;
-
-	while (size) {
-		unsigned long base, index, end, write_end, int_th;
-		unsigned long write_size, adj_write_size;
-
-		/*
-		 * write as much as possible without producing an
-		 * overflow interrupt.
-		 *
-		 * interrupt_threshold must either be
-		 * - bigger than absolute_maximum or
-		 * - point to a record between buffer_base and absolute_maximum
-		 *
-		 * index points to a valid record.
-		 */
-		base   = ds_get(context->ds, qual, ds_buffer_base);
-		index  = ds_get(context->ds, qual, ds_index);
-		end    = ds_get(context->ds, qual, ds_absolute_maximum);
-		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
-		write_end = min(end, int_th);
-
-		/* if we are already beyond the interrupt threshold,
-		 * we fill the entire buffer */
-		if (write_end <= index)
-			write_end = end;
-
-		if (write_end <= index)
-			break;
-
-		write_size = min((unsigned long) size, write_end - index);
-		memcpy((void *)index, record, write_size);
-
-		record = (const char *)record + write_size;
-		size -= write_size;
-		bytes_written += write_size;
-
-		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
-		adj_write_size *= ds_cfg.sizeof_rec[qual];
-
-		/* zero out trailing bytes */
-		memset((char *)index + write_size, 0,
-		       adj_write_size - write_size);
-		index += adj_write_size;
+		return;
 
-		if (index >= end)
-			index = base;
-		ds_set(context->ds, qual, ds_index, index);
+	ds_suspend_pebs(tracer);
 
-		if (index >= int_th)
-			ds_overflow(context, qual);
-	}
+	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
+	tracer->ds.context->pebs_master = NULL;
 
-	return bytes_written;
-}
+	put_tracer(tracer->ds.context->task);
+	ds_put_context(tracer->ds.context);
 
-int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	return ds_write(tracer->ds.context, ds_bts, record, size);
+	kfree(tracer);
 }
 
-int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
+void ds_suspend_pebs(struct pebs_tracer *tracer)
 {
-	if (!tracer)
-		return -EINVAL;
 
-	return ds_write(tracer->ds.context, ds_pebs, record, size);
 }
 
-static void ds_reset_or_clear(struct ds_context *context,
-			      enum ds_qualifier qual, int clear)
+void ds_resume_pebs(struct pebs_tracer *tracer)
 {
-	unsigned long base, end;
-
-	base = ds_get(context->ds, qual, ds_buffer_base);
-	end  = ds_get(context->ds, qual, ds_absolute_maximum);
-
-	if (clear)
-		memset((void *)base, 0, end - base);
 
-	ds_set(context->ds, qual, ds_index, base);
 }
 
-int ds_reset_bts(struct bts_tracer *tracer)
+const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
 {
 	if (!tracer)
-		return -EINVAL;
-
-	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
+		return NULL;
 
-	return 0;
+	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	return &tracer->trace;
 }
 
-int ds_reset_pebs(struct pebs_tracer *tracer)
+const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 {
 	if (!tracer)
-		return -EINVAL;
+		return NULL;
 
-	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
+	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+	tracer->trace.reset_value =
+		*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
 
-	return 0;
+	return &tracer->trace;
 }
 
-int ds_clear_bts(struct bts_tracer *tracer)
+int ds_reset_bts(struct bts_tracer *tracer)
 {
 	if (!tracer)
 		return -EINVAL;
 
-	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
-
-	return 0;
-}
-
-int ds_clear_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
+	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
+	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+	       (unsigned long)tracer->trace.ds.top);
 
 	return 0;
 }
 
-int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
+int ds_reset_pebs(struct pebs_tracer *tracer)
 {
 	if (!tracer)
 		return -EINVAL;
 
-	if (!value)
-		return -EINVAL;
+	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	*value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+	       (unsigned long)tracer->trace.ds.top);
 
 	return 0;
 }
@@ -746,35 +894,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 	return 0;
 }
 
-static const struct ds_configuration ds_cfg_var = {
-	.sizeof_ds    = sizeof(long) * 12,
-	.sizeof_field = sizeof(long),
-	.sizeof_rec[ds_bts]   = sizeof(long) * 3,
+static const struct ds_configuration ds_cfg_netburst = {
+	.name = "netburst",
+	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
+	.ctl[dsf_bts_kernel]	= (1 << 5),
+	.ctl[dsf_bts_user]	= (1 << 6),
+
+	.sizeof_field		= sizeof(long),
+	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
 #ifdef __i386__
-	.sizeof_rec[ds_pebs]  = sizeof(long) * 10
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
 #else
-	.sizeof_rec[ds_pebs]  = sizeof(long) * 18
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
 #endif
 };
-static const struct ds_configuration ds_cfg_64 = {
-	.sizeof_ds    = 8 * 12,
-	.sizeof_field = 8,
-	.sizeof_rec[ds_bts]   = 8 * 3,
+static const struct ds_configuration ds_cfg_pentium_m = {
+	.name = "pentium m",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+
+	.sizeof_field		= sizeof(long),
+	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
 #ifdef __i386__
-	.sizeof_rec[ds_pebs]  = 8 * 10
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
 #else
-	.sizeof_rec[ds_pebs]  = 8 * 18
+	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
 #endif
 };
+static const struct ds_configuration ds_cfg_core2 = {
+	.name = "core 2",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.ctl[dsf_bts_kernel]	= (1 << 9),
+	.ctl[dsf_bts_user]	= (1 << 10),
+
+	.sizeof_field		= 8,
+	.sizeof_rec[ds_bts]	= 8 * 3,
+	.sizeof_rec[ds_pebs]	= 8 * 18,
+};
 
-static inline void
+static void
 ds_configure(const struct ds_configuration *cfg)
 {
+	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
 
-	printk(KERN_INFO "DS available\n");
+	printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+
+	if (!cpu_has_bts) {
+		ds_cfg.ctl[dsf_bts] = 0;
+		printk(KERN_INFO "[ds] bts not available\n");
+	}
+	if (!cpu_has_pebs)
+		printk(KERN_INFO "[ds] pebs not available\n");
 
-	WARN_ON_ONCE(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
+	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -787,10 +959,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 			break;
 		case 0xD:
 		case 0xE: /* Pentium M */
-			ds_configure(&ds_cfg_var);
+			ds_configure(&ds_cfg_pentium_m);
 			break;
 		default: /* Core2, Atom, ... */
-			ds_configure(&ds_cfg_64);
+			ds_configure(&ds_cfg_core2);
 			break;
 		}
 		break;
@@ -799,7 +971,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x0:
 		case 0x1:
 		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_var);
+			ds_configure(&ds_cfg_netburst);
 			break;
 		default:
 			/* sorry, don't know about them */
@@ -812,14 +984,41 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 	}
 }
 
-void ds_free(struct ds_context *context)
+/*
+ * Change the DS configuration from tracing prev to tracing next.
+ */
+void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 {
-	/* This is called when the task owning the parameter context
-	 * is dying. There should not be any user of that context left
-	 * to disturb us, anymore. */
-	unsigned long leftovers = context->count;
-	while (leftovers--) {
-		put_tracer(context->task);
-		ds_put_context(context);
+	struct ds_context *prev_ctx = prev->thread.ds_ctx;
+	struct ds_context *next_ctx = next->thread.ds_ctx;
+
+	if (prev_ctx) {
+		update_debugctlmsr(0);
+
+		if (prev_ctx->bts_master &&
+		    (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+			struct bts_struct ts = {
+				.qualifier = bts_task_departs,
+				.variant.timestamp.jiffies = jiffies_64,
+				.variant.timestamp.pid = prev->pid
+			};
+			bts_write(prev_ctx->bts_master, &ts);
+		}
+	}
+
+	if (next_ctx) {
+		if (next_ctx->bts_master &&
+		    (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
+			struct bts_struct ts = {
+				.qualifier = bts_task_arrives,
+				.variant.timestamp.jiffies = jiffies_64,
+				.variant.timestamp.pid = next->pid
+			};
+			bts_write(next_ctx->bts_master, &ts);
+		}
+
+		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
 	}
+
+	update_debugctlmsr(next->thread.debugctlmsr);
 }
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 24c2276aa45..605eff9a8ac 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -252,11 +252,14 @@ void exit_thread(void)
 		put_cpu();
 	}
 #ifdef CONFIG_X86_DS
-	/* Free any DS contexts that have not been properly released. */
-	if (unlikely(current->thread.ds_ctx)) {
-		/* we clear debugctl to make sure DS is not used. */
-		update_debugctlmsr(0);
-		ds_free(current->thread.ds_ctx);
+	/* Free any BTS tracers that have not been properly released. */
+	if (unlikely(current->bts)) {
+		ds_release_bts(current->bts);
+		current->bts = NULL;
+
+		kfree(current->bts_buffer);
+		current->bts_buffer = NULL;
+		current->bts_size = 0;
 	}
 #endif /* CONFIG_X86_DS */
 }
@@ -420,48 +423,19 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }
 
-#ifdef CONFIG_X86_DS
-static int update_debugctl(struct thread_struct *prev,
-			struct thread_struct *next, unsigned long debugctl)
-{
-	unsigned long ds_prev = 0;
-	unsigned long ds_next = 0;
-
-	if (prev->ds_ctx)
-		ds_prev = (unsigned long)prev->ds_ctx->ds;
-	if (next->ds_ctx)
-		ds_next = (unsigned long)next->ds_ctx->ds;
-
-	if (ds_next != ds_prev) {
-		/* we clear debugctl to make sure DS
-		 * is not in use when we change it */
-		debugctl = 0;
-		update_debugctlmsr(0);
-		wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
-	}
-	return debugctl;
-}
-#else
-static int update_debugctl(struct thread_struct *prev,
-			struct thread_struct *next, unsigned long debugctl)
-{
-	return debugctl;
-}
-#endif /* CONFIG_X86_DS */
-
 static noinline void
 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 struct tss_struct *tss)
 {
 	struct thread_struct *prev, *next;
-	unsigned long debugctl;
 
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	debugctl = update_debugctl(prev, next, prev->debugctlmsr);
-
-	if (next->debugctlmsr != debugctl)
+	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+		ds_switch_to(prev_p, next_p);
+	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -483,15 +457,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 			hard_enable_TSC();
 	}
 
-#ifdef CONFIG_X86_PTRACE_BTS
-	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
-	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
-
-
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
 		/*
 		 * Disable the bitmap via an invalid offset. We still cache
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index fbb321d53d3..1cfd2a4bf85 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -237,11 +237,14 @@ void exit_thread(void)
 		put_cpu();
 	}
 #ifdef CONFIG_X86_DS
-	/* Free any DS contexts that have not been properly released. */
-	if (unlikely(t->ds_ctx)) {
-		/* we clear debugctl to make sure DS is not used. */
-		update_debugctlmsr(0);
-		ds_free(t->ds_ctx);
+	/* Free any BTS tracers that have not been properly released. */
+	if (unlikely(current->bts)) {
+		ds_release_bts(current->bts);
+		current->bts = NULL;
+
+		kfree(current->bts_buffer);
+		current->bts_buffer = NULL;
+		current->bts_size = 0;
 	}
 #endif /* CONFIG_X86_DS */
 }
@@ -471,35 +474,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 				    struct tss_struct *tss)
 {
 	struct thread_struct *prev, *next;
-	unsigned long debugctl;
 
 	prev = &prev_p->thread,
 	next = &next_p->thread;
 
-	debugctl = prev->debugctlmsr;
-
-#ifdef CONFIG_X86_DS
-	{
-		unsigned long ds_prev = 0, ds_next = 0;
-
-		if (prev->ds_ctx)
-			ds_prev = (unsigned long)prev->ds_ctx->ds;
-		if (next->ds_ctx)
-			ds_next = (unsigned long)next->ds_ctx->ds;
-
-		if (ds_next != ds_prev) {
-			/*
-			 * We clear debugctl to make sure DS
-			 * is not in use when we change it:
-			 */
-			debugctl = 0;
-			update_debugctlmsr(0);
-			wrmsrl(MSR_IA32_DS_AREA, ds_next);
-		}
-	}
-#endif /* CONFIG_X86_DS */
-
-	if (next->debugctlmsr != debugctl)
+	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
+	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
+		ds_switch_to(prev_p, next_p);
+	else if (next->debugctlmsr != prev->debugctlmsr)
 		update_debugctlmsr(next->debugctlmsr);
 
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -534,14 +516,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
-
-#ifdef CONFIG_X86_PTRACE_BTS
-	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
-
-	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
-		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif /* CONFIG_X86_PTRACE_BTS */
 }
 
 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b2998fe1166..45e9855da2d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,153 +581,73 @@ static int ioperm_get(struct task_struct *target,
 }
 
 #ifdef CONFIG_X86_PTRACE_BTS
-/*
- * The configuration for a particular BTS hardware implementation.
- */
-struct bts_configuration {
-	/* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
-	unsigned char  sizeof_bts;
-	/* the size of a field in the BTS record in bytes */
-	unsigned char  sizeof_field;
-	/* a bitmask to enable/disable BTS in DEBUGCTL MSR */
-	unsigned long debugctl_mask;
-};
-static struct bts_configuration bts_cfg;
-
-#define BTS_MAX_RECORD_SIZE (8 * 3)
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- */
-
-enum bts_field {
-	bts_from = 0,
-	bts_to,
-	bts_flags,
-
-	bts_escape = (unsigned long)-1,
-	bts_qual = bts_to,
-	bts_jiffies = bts_flags
-};
-
-static inline unsigned long bts_get(const char *base, enum bts_field field)
-{
-	base += (bts_cfg.sizeof_field * field);
-	return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, enum bts_field field, unsigned long val)
-{
-	base += (bts_cfg.sizeof_field * field);;
-	(*(unsigned long *)base) = val;
-}
-
-/*
- * Translate a BTS record from the raw format into the bts_struct format
- *
- * out (out): bts_struct interpretation
- * raw: raw BTS record
- */
-static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
-{
-	memset(out, 0, sizeof(*out));
-	if (bts_get(raw, bts_from) == bts_escape) {
-		out->qualifier       = bts_get(raw, bts_qual);
-		out->variant.jiffies = bts_get(raw, bts_jiffies);
-	} else {
-		out->qualifier = BTS_BRANCH;
-		out->variant.lbr.from_ip = bts_get(raw, bts_from);
-		out->variant.lbr.to_ip   = bts_get(raw, bts_to);
-	}
-}
-
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 				  struct bts_struct __user *out)
 {
-	struct bts_struct ret;
-	const void *bts_record;
-	size_t bts_index, bts_end;
+	const struct bts_trace *trace;
+	struct bts_struct bts;
+	const unsigned char *at;
 	int error;
 
-	error = ds_get_bts_end(child->bts, &bts_end);
-	if (error < 0)
-		return error;
-
-	if (bts_end <= index)
-		return -EINVAL;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
-	error = ds_get_bts_index(child->bts, &bts_index);
-	if (error < 0)
-		return error;
+	at = trace->ds.top - ((index + 1) * trace->ds.size);
+	if ((void *)at < trace->ds.begin)
+		at += (trace->ds.n * trace->ds.size);
 
-	/* translate the ptrace bts index into the ds bts index */
-	bts_index += bts_end - (index + 1);
-	if (bts_end <= bts_index)
-		bts_index -= bts_end;
+	if (!trace->read)
+		return -EOPNOTSUPP;
 
-	error = ds_access_bts(child->bts, bts_index, &bts_record);
+	error = trace->read(child->bts, at, &bts);
 	if (error < 0)
 		return error;
 
-	ptrace_bts_translate_record(&ret, bts_record);
-
-	if (copy_to_user(out, &ret, sizeof(ret)))
+	if (copy_to_user(out, &bts, sizeof(bts)))
 		return -EFAULT;
 
-	return sizeof(ret);
+	return sizeof(bts);
 }
 
 static int ptrace_bts_drain(struct task_struct *child,
 			    long size,
 			    struct bts_struct __user *out)
 {
-	struct bts_struct ret;
-	const unsigned char *raw;
-	size_t end, i;
-	int error;
+	const struct bts_trace *trace;
+	const unsigned char *at;
+	int error, drained = 0;
 
-	error = ds_get_bts_index(child->bts, &end);
-	if (error < 0)
-		return error;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
-	if (size < (end * sizeof(struct bts_struct)))
+	if (!trace->read)
+		return -EOPNOTSUPP;
+
+	if (size < (trace->ds.top - trace->ds.begin))
 		return -EIO;
 
-	error = ds_access_bts(child->bts, 0, (const void **)&raw);
-	if (error < 0)
-		return error;
+	for (at = trace->ds.begin; (void *)at < trace->ds.top;
+	     out++, drained++, at += trace->ds.size) {
+		struct bts_struct bts;
+		int error;
 
-	for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
-		ptrace_bts_translate_record(&ret, raw);
+		error = trace->read(child->bts, at, &bts);
+		if (error < 0)
+			return error;
 
-		if (copy_to_user(out, &ret, sizeof(ret)))
+		if (copy_to_user(out, &bts, sizeof(bts)))
 			return -EFAULT;
 	}
 
-	error = ds_clear_bts(child->bts);
+	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
+
+	error = ds_reset_bts(child->bts);
 	if (error < 0)
 		return error;
 
-	return end;
+	return drained;
 }
 
 static int ptrace_bts_config(struct task_struct *child,
@@ -735,136 +655,89 @@ static int ptrace_bts_config(struct task_struct *child,
 			     const struct ptrace_bts_config __user *ucfg)
 {
 	struct ptrace_bts_config cfg;
-	int error = 0;
-
-	error = -EOPNOTSUPP;
-	if (!bts_cfg.sizeof_bts)
-		goto errout;
+	unsigned int flags = 0;
 
-	error = -EIO;
 	if (cfg_size < sizeof(cfg))
-		goto errout;
+		return -EIO;
 
-	error = -EFAULT;
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-		goto errout;
-
-	error = -EINVAL;
-	if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
-	    !(cfg.flags & PTRACE_BTS_O_ALLOC))
-		goto errout;
-
-	if (cfg.flags & PTRACE_BTS_O_ALLOC) {
-		bts_ovfl_callback_t ovfl = NULL;
-		unsigned int sig = 0;
-
-		error = -EINVAL;
-		if (cfg.size < (10 * bts_cfg.sizeof_bts))
-			goto errout;
+		return -EFAULT;
 
-		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
-			if (!cfg.signal)
-				goto errout;
+	if (child->bts) {
+		ds_release_bts(child->bts);
+		child->bts = NULL;
+	}
 
-			error = -EOPNOTSUPP;
-			goto errout;
+	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+		if (!cfg.signal)
+			return -EINVAL;
 
-			sig  = cfg.signal;
-		}
+		return -EOPNOTSUPP;
 
-		if (child->bts) {
-			(void)ds_release_bts(child->bts);
-			kfree(child->bts_buffer);
+		child->thread.bts_ovfl_signal = cfg.signal;
+	}
 
-			child->bts = NULL;
-			child->bts_buffer = NULL;
-		}
+	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
+	    (cfg.size != child->bts_size)) {
+		kfree(child->bts_buffer);
 
-		error = -ENOMEM;
+		child->bts_size = cfg.size;
 		child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
-		if (!child->bts_buffer)
-			goto errout;
-
-		child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
-					    ovfl, /* th = */ (size_t)-1);
-		if (IS_ERR(child->bts)) {
-			error = PTR_ERR(child->bts);
-			kfree(child->bts_buffer);
-			child->bts = NULL;
-			child->bts_buffer = NULL;
-			goto errout;
+		if (!child->bts_buffer) {
+			child->bts_size = 0;
+			return -ENOMEM;
 		}
-
-		child->thread.bts_ovfl_signal = sig;
 	}
 
-	error = -EINVAL;
-	if (!child->thread.ds_ctx && cfg.flags)
-		goto errout;
-
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
-		child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
-	else
-		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+		flags |= BTS_USER;
 
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
-		set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-	else
-		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+		flags |= BTS_TIMESTAMPS;
 
-	error = sizeof(cfg);
+	child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
+				    /* ovfl = */ NULL, /* th = */ (size_t)-1,
+				    flags);
+	if (IS_ERR(child->bts)) {
+		int error = PTR_ERR(child->bts);
 
-out:
-	if (child->thread.debugctlmsr)
-		set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-	else
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+		kfree(child->bts_buffer);
+		child->bts = NULL;
+		child->bts_buffer = NULL;
+		child->bts_size = 0;
 
-	return error;
+		return error;
+	}
 
-errout:
-	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-	goto out;
+	return sizeof(cfg);
 }
 
 static int ptrace_bts_status(struct task_struct *child,
 			     long cfg_size,
 			     struct ptrace_bts_config __user *ucfg)
 {
+	const struct bts_trace *trace;
 	struct ptrace_bts_config cfg;
-	size_t end;
-	const void *base, *max;
-	int error;
 
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
-	error = ds_get_bts_end(child->bts, &end);
-	if (error < 0)
-		return error;
-
-	error = ds_access_bts(child->bts, /* index = */ 0, &base);
-	if (error < 0)
-		return error;
-
-	error = ds_access_bts(child->bts, /* index = */ end, &max);
-	if (error < 0)
-		return error;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
 	memset(&cfg, 0, sizeof(cfg));
-	cfg.size = (max - base);
+	cfg.size = trace->ds.end - trace->ds.begin;
 	cfg.signal = child->thread.bts_ovfl_signal;
 	cfg.bts_size = sizeof(struct bts_struct);
 
 	if (cfg.signal)
 		cfg.flags |= PTRACE_BTS_O_SIGNAL;
 
-	if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
-	    child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+	if (trace->ds.flags & BTS_USER)
 		cfg.flags |= PTRACE_BTS_O_TRACE;
 
-	if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+	if (trace->ds.flags & BTS_TIMESTAMPS)
 		cfg.flags |= PTRACE_BTS_O_SCHED;
 
 	if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -873,105 +746,28 @@ static int ptrace_bts_status(struct task_struct *child,
 	return sizeof(cfg);
 }
 
-static int ptrace_bts_write_record(struct task_struct *child,
-				   const struct bts_struct *in)
+static int ptrace_bts_clear(struct task_struct *child)
 {
-	unsigned char bts_record[BTS_MAX_RECORD_SIZE];
+	const struct bts_trace *trace;
 
-	if (BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts)
-		return -EOVERFLOW;
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
-	memset(bts_record, 0, bts_cfg.sizeof_bts);
-	switch (in->qualifier) {
-	case BTS_INVALID:
-		break;
+	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-	case BTS_BRANCH:
-		bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
-		bts_set(bts_record, bts_to,   in->variant.lbr.to_ip);
-		break;
-
-	case BTS_TASK_ARRIVES:
-	case BTS_TASK_DEPARTS:
-		bts_set(bts_record, bts_from,    bts_escape);
-		bts_set(bts_record, bts_qual,    in->qualifier);
-		bts_set(bts_record, bts_jiffies, in->variant.jiffies);
-		break;
-
-	default:
-		return -EINVAL;
-	}
-
-	return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
+	return ds_reset_bts(child->bts);
 }
 
-void ptrace_bts_take_timestamp(struct task_struct *tsk,
-			       enum bts_qualifier qualifier)
+static int ptrace_bts_size(struct task_struct *child)
 {
-	struct bts_struct rec = {
-		.qualifier = qualifier,
-		.variant.jiffies = jiffies_64
-	};
-
-	ptrace_bts_write_record(tsk, &rec);
-}
-
-static const struct bts_configuration bts_cfg_netburst = {
-	.sizeof_bts    = sizeof(long) * 3,
-	.sizeof_field  = sizeof(long),
-	.debugctl_mask = (1<<2)|(1<<3)|(1<<5)
-};
+	const struct bts_trace *trace;
 
-static const struct bts_configuration bts_cfg_pentium_m = {
-	.sizeof_bts    = sizeof(long) * 3,
-	.sizeof_field  = sizeof(long),
-	.debugctl_mask = (1<<6)|(1<<7)
-};
+	trace = ds_read_bts(child->bts);
+	if (!trace)
+		return -EPERM;
 
-static const struct bts_configuration bts_cfg_core2 = {
-	.sizeof_bts    = 8 * 3,
-	.sizeof_field  = 8,
-	.debugctl_mask = (1<<6)|(1<<7)|(1<<9)
-};
-
-static inline void bts_configure(const struct bts_configuration *cfg)
-{
-	bts_cfg = *cfg;
-}
-
-void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
-{
-	switch (c->x86) {
-	case 0x6:
-		switch (c->x86_model) {
-		case 0 ... 0xC:
-			/* sorry, don't know about them */
-			break;
-		case 0xD:
-		case 0xE: /* Pentium M */
-			bts_configure(&bts_cfg_pentium_m);
-			break;
-		default: /* Core2, Atom, ... */
-			bts_configure(&bts_cfg_core2);
-			break;
-		}
-		break;
-	case 0xF:
-		switch (c->x86_model) {
-		case 0x0:
-		case 0x1:
-		case 0x2: /* Netburst */
-			bts_configure(&bts_cfg_netburst);
-			break;
-		default:
-			/* sorry, don't know about them */
-			break;
-		}
-		break;
-	default:
-		/* sorry, don't know about them */
-		break;
-	}
+	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 #endif /* CONFIG_X86_PTRACE_BTS */
 
@@ -988,15 +784,12 @@ void ptrace_disable(struct task_struct *child)
 #endif
 #ifdef CONFIG_X86_PTRACE_BTS
 	if (child->bts) {
-		(void)ds_release_bts(child->bts);
+		ds_release_bts(child->bts);
+		child->bts = NULL;
+
 		kfree(child->bts_buffer);
 		child->bts_buffer = NULL;
-
-		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-		if (!child->thread.debugctlmsr)
-			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-
-		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+		child->bts_size = 0;
 	}
 #endif /* CONFIG_X86_PTRACE_BTS */
 }
@@ -1129,16 +922,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 			(child, data, (struct ptrace_bts_config __user *)addr);
 		break;
 
-	case PTRACE_BTS_SIZE: {
-		size_t size;
-
-		ret = ds_get_bts_index(child->bts, &size);
-		if (ret == 0) {
-			WARN_ON_ONCE(size != (int) size);
-			ret = (int) size;
-		}
+	case PTRACE_BTS_SIZE:
+		ret = ptrace_bts_size(child);
 		break;
-	}
 
 	case PTRACE_BTS_GET:
 		ret = ptrace_bts_read_record
@@ -1146,7 +932,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	case PTRACE_BTS_CLEAR:
-		ret = ds_clear_bts(child->bts);
+		ret = ptrace_bts_clear(child);
 		break;
 
 	case PTRACE_BTS_DRAIN:
@@ -1409,6 +1195,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_GET_THREAD_AREA:
 	case PTRACE_SET_THREAD_AREA:
+#ifdef CONFIG_X86_PTRACE_BTS
+	case PTRACE_BTS_CONFIG:
+	case PTRACE_BTS_STATUS:
+	case PTRACE_BTS_SIZE:
+	case PTRACE_BTS_GET:
+	case PTRACE_BTS_CLEAR:
+	case PTRACE_BTS_DRAIN:
+#endif /* CONFIG_X86_PTRACE_BTS */
 		return arch_ptrace(child, request, addr, data);
 
 	default:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b81fc5f773..dc5ea65dc71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1176,6 +1176,7 @@ struct task_struct {
 	 * The buffer to hold the BTS data.
 	 */
 	void *bts_buffer;
+	size_t bts_size;
 #endif /* CONFIG_X86_PTRACE_BTS */
 
 	/* PID/PID hash table linkage. */
-- 
cgit v1.2.3-70-g09d2


From bcbc4f20b52c2c40c43a4d2337707dcdfe81bc3a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 9 Dec 2008 23:54:20 +0100
Subject: tracing/function-graph-tracer: annotate do_IRQ and
 smp_apic_timer_interrupt

Impact: move most important x86 irq entry-points to a separate subsection

Annotate do_IRQ and smp_apic_timer_interrupt to put them into the .irqentry.text
subsection. These function will so be recognized as hardirq entrypoints for the
function-graph-tracer. We could also annotate other irq entries but the others
are far less important but they can be added on request.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic.c   |  3 ++-
 arch/x86/kernel/irq_64.c |  3 ++-
 include/linux/ftrace.h   | 11 +++++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b52..b946ac19753 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/dmar.h>
+#include <linux/ftrace.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -800,7 +801,7 @@ static void local_apic_timer_interrupt(void)
  * [ if a single-CPU system runs an SMP kernel then we call the local
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a..11c65e811ff 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,6 +13,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/delay.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/io_apic.h>
 #include <asm/idle.h>
@@ -47,7 +48,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 	struct irq_desc *desc;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 11cac81eed0..44020f31bd8 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -377,6 +377,16 @@ struct ftrace_graph_ret {
  */
 #define __notrace_funcgraph		notrace
 
+/*
+ * We want to which function is an entrypoint of a hardirq.
+ * That will help us to put a signal on output.
+ */
+#define __irq_entry		 __attribute__((__section__(".irqentry.text")))
+
+/* Limits of hardirq entrypoints */
+extern char __irqentry_text_start[];
+extern char __irqentry_text_end[];
+
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
 /* Type of the callback handlers for tracing function graph*/
@@ -414,6 +424,7 @@ static inline void unpause_graph_tracing(void)
 #else
 
 #define __notrace_funcgraph
+#define __irq_entry
 
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
-- 
cgit v1.2.3-70-g09d2


From 0ebb26e7a4e2c5337502e98b2221e037fda911b9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 12 Dec 2008 11:26:39 +0100
Subject: sparse irqs: handle !GENIRQ platforms

Impact: build fix

fix:

 In file included from /home/mingo/tip/arch/m68k/amiga/amiints.c:39:
 /home/mingo/tip/include/linux/interrupt.h:21: error: expected identifier or '('
 /home/mingo/tip/arch/m68k/amiga/amiints.c: In function 'amiga_init_IRQ':

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h |  4 ++--
 include/linux/irqnr.h     | 11 ++++++++++-
 include/linux/random.h    |  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 79e915e7e8a..777f89e00b4 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -14,12 +14,12 @@
 #include <linux/irqflags.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
+#include <linux/irqnr.h>
+
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-extern int nr_irqs;
-
 /*
  * These correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 13754f81358..95d2b74641f 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -1,6 +1,11 @@
 #ifndef _LINUX_IRQNR_H
 #define _LINUX_IRQNR_H
 
+/*
+ * Generic irq_desc iterators:
+ */
+#ifdef __KERNEL__
+
 #ifndef CONFIG_GENERIC_HARDIRQS
 #include <asm/irq.h>
 # define nr_irqs		NR_IRQS
@@ -11,10 +16,12 @@
 # define for_each_irq_desc_reverse(irq, desc)                          \
 	for (irq = nr_irqs - 1; irq >= 0; irq--)
 #else
+
+extern int nr_irqs;
+
 #ifndef CONFIG_SPARSE_IRQ
 
 struct irq_desc;
-extern int nr_irqs;
 # define for_each_irq_desc(irq, desc)		\
 	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
 # define for_each_irq_desc_reverse(irq, desc)                          \
@@ -26,4 +33,6 @@ extern int nr_irqs;
 #define for_each_irq_nr(irq)                   \
        for (irq = 0; irq < nr_irqs; irq++)
 
+#endif /* __KERNEL__ */
+
 #endif
diff --git a/include/linux/random.h b/include/linux/random.h
index ad9daa2374d..adbf3bd3c6b 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -8,6 +8,7 @@
 #define _LINUX_RANDOM_H
 
 #include <linux/ioctl.h>
+#include <linux/irqnr.h>
 
 /* ioctl()'s for the random number generator */
 
@@ -49,7 +50,6 @@ struct timer_rand_state;
 
 extern struct timer_rand_state *irq_timer_state[];
 
-extern int nr_irqs;
 static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
 {
 	if (irq >= nr_irqs)
-- 
cgit v1.2.3-70-g09d2


From 30cb367ea2be76bf71dbd275f38d0fd3b6f4142b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 12 Dec 2008 12:19:57 +0100
Subject: sparse irqs: add irqnr.h to the user headers list

Impact: fix build error

/home/mingo/tip/usr/include/linux/random.h:11: included file
'linux/irqnr.h' is not exported

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/Kbuild | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index e531783e5d7..95ac82340c3 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -313,6 +313,7 @@ unifdef-y += ptrace.h
 unifdef-y += qnx4_fs.h
 unifdef-y += quota.h
 unifdef-y += random.h
+unifdef-y += irqnr.h
 unifdef-y += reboot.h
 unifdef-y += reiserfs_fs.h
 unifdef-y += reiserfs_xattr.h
-- 
cgit v1.2.3-70-g09d2


From ee79d1bdb6a10499e53f80b1e8d14110215178ba Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 9 Dec 2008 18:49:50 +0100
Subject: sched: let arch_update_cpu_topology indicate if topology changed

Change arch_update_cpu_topology so it returns 1 if the cpu topology changed
and 0 if it didn't change. This will be useful for the next patch which adds
a call to this function in partition_sched_domains.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/s390/kernel/topology.c | 5 +++--
 include/linux/topology.h    | 2 +-
 kernel/sched.c              | 8 +++++++-
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index a947899dcba..bf96f1b5c6e 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
 		cpu_core_map[cpu] = cpu_coregroup_map(cpu);
 }
 
-void arch_update_cpu_topology(void)
+int arch_update_cpu_topology(void)
 {
 	struct tl_info *info = tl_info;
 	struct sys_device *sysdev;
@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
 	if (!machine_has_topology) {
 		update_cpu_core_map();
 		topology_update_polarization_simple();
-		return;
+		return 0;
 	}
 	stsi(info, 15, 1, 2);
 	tl_to_cores(info);
@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
 		sysdev = get_cpu_sysdev(cpu);
 		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
 	}
+	return 1;
 }
 
 static void topology_work_fn(struct work_struct *work)
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 117f1b7405c..0c5b5ac36d8 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -49,7 +49,7 @@
 	for_each_online_node(node)			\
 		if (nr_cpus_node(node))
 
-void arch_update_cpu_topology(void);
+int arch_update_cpu_topology(void);
 
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE		10
diff --git a/kernel/sched.c b/kernel/sched.c
index ef212da928e..fcfbbd9dbd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7675,8 +7675,14 @@ static struct sched_domain_attr *dattr_cur;
  */
 static cpumask_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 5b37717a23b8e40f6cf7ad85a26ddcf41c171e2c Mon Sep 17 00:00:00 2001
From: Stefano Panella <stefano.panella@csr.com>
Date: Fri, 12 Dec 2008 13:00:06 +0000
Subject: uwb: improved MAS allocator and reservation conflict handling

Greatly enhance the MAS allocator:
  - Handle row and column reservations.
  - Permit all the available MAS to be allocated.
  - Follows the WiMedia rules on MAS selection.

Take appropriate action when reservation conflicts are detected.
  - Correctly identify which reservation wins the conflict.
  - Protect alien BP reservations.
  - If an owned reservation loses, resize/move it.
  - Follow the backoff procedure before requesting additional MAS.

When reservations are terminated, move the remaining reservations (if
necessary) so they keep following the MAS allocation rules.

Signed-off-by: Stefano Panella <stefano.panella@csr.com>
Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/usb/wusbcore/reservation.c |  13 +-
 drivers/uwb/Makefile               |   1 +
 drivers/uwb/allocator.c            | 386 +++++++++++++++++++++
 drivers/uwb/drp-avail.c            |   4 +-
 drivers/uwb/drp-ie.c               | 160 +++++++--
 drivers/uwb/drp.c                  | 681 ++++++++++++++++++++++++++++---------
 drivers/uwb/rsv.c                  | 482 ++++++++++++++++++++------
 drivers/uwb/uwb-debug.c            |  49 +--
 drivers/uwb/uwb-internal.h         |  80 ++++-
 include/linux/uwb.h                |  47 ++-
 include/linux/uwb/debug-cmd.h      |   2 +-
 include/linux/uwb/spec.h           |  25 ++
 12 files changed, 1603 insertions(+), 327 deletions(-)
 create mode 100644 drivers/uwb/allocator.c

(limited to 'include/linux')

diff --git a/drivers/usb/wusbcore/reservation.c b/drivers/usb/wusbcore/reservation.c
index 7b6525dac2f..c37e4f83e54 100644
--- a/drivers/usb/wusbcore/reservation.c
+++ b/drivers/usb/wusbcore/reservation.c
@@ -48,13 +48,15 @@ static void wusbhc_rsv_complete_cb(struct uwb_rsv *rsv)
 {
 	struct wusbhc *wusbhc = rsv->pal_priv;
 	struct device *dev = wusbhc->dev;
+	struct uwb_mas_bm mas;
 	char buf[72];
 
 	switch (rsv->state) {
 	case UWB_RSV_STATE_O_ESTABLISHED:
-		bitmap_scnprintf(buf, sizeof(buf), rsv->mas.bm, UWB_NUM_MAS);
+		uwb_rsv_get_usable_mas(rsv, &mas);
+		bitmap_scnprintf(buf, sizeof(buf), mas.bm, UWB_NUM_MAS);
 		dev_dbg(dev, "established reservation: %s\n", buf);
-		wusbhc_bwa_set(wusbhc, rsv->stream, &rsv->mas);
+		wusbhc_bwa_set(wusbhc, rsv->stream, &mas);
 		break;
 	case UWB_RSV_STATE_NONE:
 		dev_dbg(dev, "removed reservation\n");
@@ -85,13 +87,12 @@ int wusbhc_rsv_establish(struct wusbhc *wusbhc)
 	bcid.data[0] = wusbhc->cluster_id;
 	bcid.data[1] = 0;
 
-	rsv->owner = &rc->uwb_dev;
 	rsv->target.type = UWB_RSV_TARGET_DEVADDR;
 	rsv->target.devaddr = bcid;
 	rsv->type = UWB_DRP_TYPE_PRIVATE;
-	rsv->max_mas = 256;
-	rsv->min_mas = 16;  /* one MAS per zone? */
-	rsv->sparsity = 16; /* at least one MAS in each zone? */
+	rsv->max_mas = 256; /* try to get as much as possible */
+	rsv->min_mas = 15;  /* one MAS per zone */
+	rsv->max_interval = 1; /* max latency is one zone */
 	rsv->is_multicast = true;
 
 	ret = uwb_rsv_establish(rsv);
diff --git a/drivers/uwb/Makefile b/drivers/uwb/Makefile
index ce21a95da04..2f98d080fe7 100644
--- a/drivers/uwb/Makefile
+++ b/drivers/uwb/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_UWB_I1480U)	+= i1480/
 
 uwb-objs :=		\
 	address.o	\
+	allocator.o	\
 	beacon.o	\
 	driver.o	\
 	drp.o		\
diff --git a/drivers/uwb/allocator.c b/drivers/uwb/allocator.c
new file mode 100644
index 00000000000..c8185e6b0cd
--- /dev/null
+++ b/drivers/uwb/allocator.c
@@ -0,0 +1,386 @@
+/*
+ * UWB reservation management.
+ *
+ * Copyright (C) 2008 Cambridge Silicon Radio Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/uwb.h>
+
+#include "uwb-internal.h"
+
+static void uwb_rsv_fill_column_alloc(struct uwb_rsv_alloc_info *ai)
+{
+	int col, mas, safe_mas, unsafe_mas;
+	unsigned char *bm = ai->bm;
+	struct uwb_rsv_col_info *ci = ai->ci;
+	unsigned char c;
+
+	for (col = ci->csi.start_col; col < UWB_NUM_ZONES; col += ci->csi.interval) {
+    
+		safe_mas   = ci->csi.safe_mas_per_col;
+		unsafe_mas = ci->csi.unsafe_mas_per_col;
+    
+		for (mas = 0; mas < UWB_MAS_PER_ZONE; mas++ ) {
+			if (bm[col * UWB_MAS_PER_ZONE + mas] == 0) {
+	
+				if (safe_mas > 0) {
+					safe_mas--;
+					c = UWB_RSV_MAS_SAFE;
+				} else if (unsafe_mas > 0) {
+					unsafe_mas--;
+					c = UWB_RSV_MAS_UNSAFE;
+				} else {
+					break;
+				}
+				bm[col * UWB_MAS_PER_ZONE + mas] = c;
+			}
+		}
+	}
+}
+
+static void uwb_rsv_fill_row_alloc(struct uwb_rsv_alloc_info *ai)
+{
+	int mas, col, rows;
+	unsigned char *bm = ai->bm;
+	struct uwb_rsv_row_info *ri = &ai->ri;
+	unsigned char c;
+
+	rows = 1;
+	c = UWB_RSV_MAS_SAFE;
+	for (mas = UWB_MAS_PER_ZONE - 1; mas >= 0; mas--) {
+		if (ri->avail[mas] == 1) {
+      
+			if (rows > ri->used_rows) {
+				break;
+			} else if (rows > 7) {
+				c = UWB_RSV_MAS_UNSAFE;
+			}
+
+			for (col = 0; col < UWB_NUM_ZONES; col++) {
+				if (bm[col * UWB_NUM_ZONES + mas] != UWB_RSV_MAS_NOT_AVAIL) {
+					bm[col * UWB_NUM_ZONES + mas] = c;
+					if(c == UWB_RSV_MAS_SAFE)
+						ai->safe_allocated_mases++;
+					else
+						ai->unsafe_allocated_mases++;
+				}
+			}
+			rows++;
+		}
+	}
+	ai->total_allocated_mases = ai->safe_allocated_mases + ai->unsafe_allocated_mases;
+}
+
+/*
+ * Find the best column set for a given availability, interval, num safe mas and
+ * num unsafe mas.
+ *
+ * The different sets are tried in order as shown below, depending on the interval.
+ *
+ * interval = 16
+ *	deep = 0
+ *		set 1 ->  {  8 }
+ *	deep = 1
+ *		set 1 ->  {  4 }
+ *		set 2 ->  { 12 }
+ *	deep = 2
+ *		set 1 ->  {  2 }
+ *		set 2 ->  {  6 }
+ *		set 3 ->  { 10 }
+ *		set 4 ->  { 14 }
+ *	deep = 3
+ *		set 1 ->  {  1 }
+ *		set 2 ->  {  3 }
+ *		set 3 ->  {  5 }
+ *		set 4 ->  {  7 }
+ *		set 5 ->  {  9 }
+ *		set 6 ->  { 11 }
+ *		set 7 ->  { 13 }
+ *		set 8 ->  { 15 }
+ *
+ * interval = 8
+ *	deep = 0
+ *		set 1 ->  {  4  12 }
+ *	deep = 1
+ *		set 1 ->  {  2  10 }
+ *		set 2 ->  {  6  14 }
+ *	deep = 2
+ *		set 1 ->  {  1   9 }
+ *		set 2 ->  {  3  11 }
+ *		set 3 ->  {  5  13 }
+ *		set 4 ->  {  7  15 }
+ *
+ * interval = 4
+ *	deep = 0
+ *		set 1 ->  {  2   6  10  14 }
+ *	deep = 1
+ *		set 1 ->  {  1   5   9  13 }
+ *		set 2 ->  {  3   7  11  15 }
+ *
+ * interval = 2
+ *	deep = 0
+ *		set 1 ->  {  1   3   5   7   9  11  13  15 }
+ */
+static int uwb_rsv_find_best_column_set(struct uwb_rsv_alloc_info *ai, int interval, 
+					int num_safe_mas, int num_unsafe_mas)
+{
+	struct uwb_rsv_col_info *ci = ai->ci;
+	struct uwb_rsv_col_set_info *csi = &ci->csi;
+	struct uwb_rsv_col_set_info tmp_csi;
+	int deep, set, col, start_col_deep, col_start_set;
+	int start_col, max_mas_in_set, lowest_max_mas_in_deep;
+	int n_mas;
+	int found = UWB_RSV_ALLOC_NOT_FOUND; 
+
+	tmp_csi.start_col = 0;
+	start_col_deep = interval;
+	n_mas = num_unsafe_mas + num_safe_mas;
+
+	for (deep = 0; ((interval >> deep) & 0x1) == 0; deep++) {
+		start_col_deep /= 2;
+		col_start_set = 0;
+		lowest_max_mas_in_deep = UWB_MAS_PER_ZONE;
+
+		for (set = 1; set <= (1 << deep); set++) {
+			max_mas_in_set = 0;
+			start_col = start_col_deep + col_start_set;
+			for (col = start_col; col < UWB_NUM_ZONES; col += interval) {
+                
+				if (ci[col].max_avail_safe >= num_safe_mas &&
+				    ci[col].max_avail_unsafe >= n_mas) {
+					if (ci[col].highest_mas[n_mas] > max_mas_in_set)
+						max_mas_in_set = ci[col].highest_mas[n_mas];
+				} else {
+					max_mas_in_set = 0;
+					break;
+				}
+			}
+			if ((lowest_max_mas_in_deep > max_mas_in_set) && max_mas_in_set) {
+				lowest_max_mas_in_deep = max_mas_in_set;
+
+				tmp_csi.start_col = start_col;
+			}
+			col_start_set += (interval >> deep);
+		}
+
+		if (lowest_max_mas_in_deep < 8) {
+			csi->start_col = tmp_csi.start_col;
+			found = UWB_RSV_ALLOC_FOUND;
+			break;
+		} else if ((lowest_max_mas_in_deep > 8) && 
+			   (lowest_max_mas_in_deep != UWB_MAS_PER_ZONE) &&
+			   (found == UWB_RSV_ALLOC_NOT_FOUND)) {
+			csi->start_col = tmp_csi.start_col;
+			found = UWB_RSV_ALLOC_FOUND;
+		}
+	}
+
+	if (found == UWB_RSV_ALLOC_FOUND) {
+		csi->interval = interval;
+		csi->safe_mas_per_col = num_safe_mas;
+		csi->unsafe_mas_per_col = num_unsafe_mas;
+
+		ai->safe_allocated_mases = (UWB_NUM_ZONES / interval) * num_safe_mas;
+		ai->unsafe_allocated_mases = (UWB_NUM_ZONES / interval) * num_unsafe_mas;
+		ai->total_allocated_mases = ai->safe_allocated_mases + ai->unsafe_allocated_mases;
+		ai->interval = interval;		
+	}
+	return found;
+}
+
+static void get_row_descriptors(struct uwb_rsv_alloc_info *ai)
+{
+	unsigned char *bm = ai->bm;
+	struct uwb_rsv_row_info *ri = &ai->ri;
+	int col, mas;
+  
+	ri->free_rows = 16;
+	for (mas = 0; mas < UWB_MAS_PER_ZONE; mas ++) {
+		ri->avail[mas] = 1;
+		for (col = 1; col < UWB_NUM_ZONES; col++) {
+			if (bm[col * UWB_NUM_ZONES + mas] == UWB_RSV_MAS_NOT_AVAIL) {
+				ri->free_rows--;
+				ri->avail[mas]=0;
+				break;
+			}
+		}
+	}
+}
+
+static void uwb_rsv_fill_column_info(unsigned char *bm, int column, struct uwb_rsv_col_info *rci)
+{
+	int mas;
+	int block_count = 0, start_block = 0; 
+	int previous_avail = 0;
+	int available = 0;
+	int safe_mas_in_row[UWB_MAS_PER_ZONE] = {
+		8, 7, 6, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 1,
+	};
+
+	rci->max_avail_safe = 0;
+
+	for (mas = 0; mas < UWB_MAS_PER_ZONE; mas ++) {
+		if (!bm[column * UWB_NUM_ZONES + mas]) {
+			available++;
+			rci->max_avail_unsafe = available;
+
+			rci->highest_mas[available] = mas;
+
+			if (previous_avail) {
+				block_count++;
+				if ((block_count > safe_mas_in_row[start_block]) &&
+				    (!rci->max_avail_safe))
+					rci->max_avail_safe = available - 1;
+			} else {
+				previous_avail = 1;
+				start_block = mas;
+				block_count = 1;
+			}
+		} else {
+			previous_avail = 0;
+		}
+	}
+	if (!rci->max_avail_safe)
+		rci->max_avail_safe = rci->max_avail_unsafe;
+}
+
+static void get_column_descriptors(struct uwb_rsv_alloc_info *ai)
+{
+	unsigned char *bm = ai->bm;
+	struct uwb_rsv_col_info *ci = ai->ci;
+	int col;
+
+	for (col = 1; col < UWB_NUM_ZONES; col++) {
+		uwb_rsv_fill_column_info(bm, col, &ci[col]);
+	}
+}
+
+static int uwb_rsv_find_best_row_alloc(struct uwb_rsv_alloc_info *ai)
+{
+	int n_rows;
+	int max_rows = ai->max_mas / UWB_USABLE_MAS_PER_ROW;
+	int min_rows = ai->min_mas / UWB_USABLE_MAS_PER_ROW;
+	if (ai->min_mas % UWB_USABLE_MAS_PER_ROW)
+		min_rows++;
+	for (n_rows = max_rows; n_rows >= min_rows; n_rows--) {
+		if (n_rows <= ai->ri.free_rows) {
+			ai->ri.used_rows = n_rows;
+			ai->interval = 1; /* row reservation */
+			uwb_rsv_fill_row_alloc(ai);
+			return UWB_RSV_ALLOC_FOUND;
+		}
+	}  
+	return UWB_RSV_ALLOC_NOT_FOUND;
+}
+
+static int uwb_rsv_find_best_col_alloc(struct uwb_rsv_alloc_info *ai, int interval)
+{
+	int n_safe, n_unsafe, n_mas;  
+	int n_column = UWB_NUM_ZONES / interval;
+	int max_per_zone = ai->max_mas / n_column;
+	int min_per_zone = ai->min_mas / n_column;
+
+	if (ai->min_mas % n_column)
+		min_per_zone++;
+
+	if (min_per_zone > UWB_MAS_PER_ZONE) {
+		return UWB_RSV_ALLOC_NOT_FOUND;
+	}
+    
+	if (max_per_zone > UWB_MAS_PER_ZONE) {
+		max_per_zone = UWB_MAS_PER_ZONE;
+	}
+    
+	for (n_mas = max_per_zone; n_mas >= min_per_zone; n_mas--) {
+		if (uwb_rsv_find_best_column_set(ai, interval, 0, n_mas) == UWB_RSV_ALLOC_NOT_FOUND)
+			continue;
+		for (n_safe = n_mas; n_safe >= 0; n_safe--) {
+			n_unsafe = n_mas - n_safe;
+			if (uwb_rsv_find_best_column_set(ai, interval, n_safe, n_unsafe) == UWB_RSV_ALLOC_FOUND) {
+				uwb_rsv_fill_column_alloc(ai);
+				return UWB_RSV_ALLOC_FOUND;
+			}
+		}
+	}
+	return UWB_RSV_ALLOC_NOT_FOUND;
+}
+
+int uwb_rsv_find_best_allocation(struct uwb_rsv *rsv, struct uwb_mas_bm *available, 
+				 struct uwb_mas_bm *result)
+{
+	struct uwb_rsv_alloc_info *ai;
+	int interval;
+	int bit_index;
+
+	ai = kzalloc(sizeof(struct uwb_rsv_alloc_info), GFP_KERNEL);
+	
+	ai->min_mas = rsv->min_mas;
+	ai->max_mas = rsv->max_mas;
+	ai->max_interval = rsv->max_interval;
+
+
+	/* fill the not available vector from the available bm */
+	for (bit_index = 0; bit_index < UWB_NUM_MAS; bit_index++) {
+		if (!test_bit(bit_index, available->bm))
+			ai->bm[bit_index] = UWB_RSV_MAS_NOT_AVAIL;
+	}
+
+	if (ai->max_interval == 1) {
+		get_row_descriptors(ai);
+		if (uwb_rsv_find_best_row_alloc(ai) == UWB_RSV_ALLOC_FOUND)
+			goto alloc_found;
+		else
+			goto alloc_not_found;
+	}
+
+	get_column_descriptors(ai);
+        
+	for (interval = 16; interval >= 2; interval>>=1) {
+		if (interval > ai->max_interval)
+			continue;
+		if (uwb_rsv_find_best_col_alloc(ai, interval) == UWB_RSV_ALLOC_FOUND)
+			goto alloc_found;
+	}
+
+	/* try row reservation if no column is found */
+	get_row_descriptors(ai);
+	if (uwb_rsv_find_best_row_alloc(ai) == UWB_RSV_ALLOC_FOUND)
+		goto alloc_found;
+	else
+		goto alloc_not_found;
+
+  alloc_found:
+	bitmap_zero(result->bm, UWB_NUM_MAS);
+	bitmap_zero(result->unsafe_bm, UWB_NUM_MAS);
+	/* fill the safe and unsafe bitmaps */
+	for (bit_index = 0; bit_index < UWB_NUM_MAS; bit_index++) {
+		if (ai->bm[bit_index] == UWB_RSV_MAS_SAFE)
+			set_bit(bit_index, result->bm);
+		else if (ai->bm[bit_index] == UWB_RSV_MAS_UNSAFE)
+			set_bit(bit_index, result->unsafe_bm);
+	}
+	bitmap_or(result->bm, result->bm, result->unsafe_bm, UWB_NUM_MAS);
+
+	result->safe   = ai->safe_allocated_mases;
+	result->unsafe = ai->unsafe_allocated_mases;
+	
+	kfree(ai);		
+	return UWB_RSV_ALLOC_FOUND;
+  
+  alloc_not_found:
+	kfree(ai);
+	return UWB_RSV_ALLOC_NOT_FOUND;
+}
diff --git a/drivers/uwb/drp-avail.c b/drivers/uwb/drp-avail.c
index 3febd855280..40a540a5a72 100644
--- a/drivers/uwb/drp-avail.c
+++ b/drivers/uwb/drp-avail.c
@@ -58,7 +58,7 @@ void uwb_drp_avail_init(struct uwb_rc *rc)
  *
  * avail = global & local & pending
  */
-static void uwb_drp_available(struct uwb_rc *rc, struct uwb_mas_bm *avail)
+void uwb_drp_available(struct uwb_rc *rc, struct uwb_mas_bm *avail)
 {
 	bitmap_and(avail->bm, rc->drp_avail.global, rc->drp_avail.local, UWB_NUM_MAS);
 	bitmap_and(avail->bm, avail->bm, rc->drp_avail.pending, UWB_NUM_MAS);
@@ -105,6 +105,7 @@ void uwb_drp_avail_release(struct uwb_rc *rc, struct uwb_mas_bm *mas)
 	bitmap_or(rc->drp_avail.local, rc->drp_avail.local, mas->bm, UWB_NUM_MAS);
 	bitmap_or(rc->drp_avail.pending, rc->drp_avail.pending, mas->bm, UWB_NUM_MAS);
 	rc->drp_avail.ie_valid = false;
+	uwb_rsv_handle_drp_avail_change(rc);
 }
 
 /**
@@ -280,6 +281,7 @@ int uwbd_evt_handle_rc_drp_avail(struct uwb_event *evt)
 	mutex_lock(&rc->rsvs_mutex);
 	bitmap_copy(rc->drp_avail.global, bmp, UWB_NUM_MAS);
 	rc->drp_avail.ie_valid = false;
+	uwb_rsv_handle_drp_avail_change(rc);
 	mutex_unlock(&rc->rsvs_mutex);
 
 	uwb_rsv_sched_update(rc);
diff --git a/drivers/uwb/drp-ie.c b/drivers/uwb/drp-ie.c
index 75491d47806..2840d7bf9e6 100644
--- a/drivers/uwb/drp-ie.c
+++ b/drivers/uwb/drp-ie.c
@@ -22,6 +22,96 @@
 
 #include "uwb-internal.h"
 
+
+/*
+ * Return the reason code for a reservations's DRP IE.
+ */
+int uwb_rsv_reason_code(struct uwb_rsv *rsv)
+{
+	static const int reason_codes[] = {
+		[UWB_RSV_STATE_O_INITIATED]          = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_O_PENDING]            = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_O_MODIFIED]           = UWB_DRP_REASON_MODIFIED,
+		[UWB_RSV_STATE_O_ESTABLISHED]        = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_O_TO_BE_MOVED]        = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_O_MOVE_COMBINING]     = UWB_DRP_REASON_MODIFIED,
+		[UWB_RSV_STATE_O_MOVE_REDUCING]      = UWB_DRP_REASON_MODIFIED,
+		[UWB_RSV_STATE_O_MOVE_EXPANDING]     = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_T_ACCEPTED]           = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_T_CONFLICT]           = UWB_DRP_REASON_CONFLICT,
+		[UWB_RSV_STATE_T_PENDING]            = UWB_DRP_REASON_PENDING,
+		[UWB_RSV_STATE_T_DENIED]             = UWB_DRP_REASON_DENIED,
+		[UWB_RSV_STATE_T_RESIZED]            = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_T_EXPANDING_ACCEPTED] = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_T_EXPANDING_CONFLICT] = UWB_DRP_REASON_CONFLICT,
+		[UWB_RSV_STATE_T_EXPANDING_PENDING]  = UWB_DRP_REASON_PENDING,
+		[UWB_RSV_STATE_T_EXPANDING_DENIED]   = UWB_DRP_REASON_DENIED,
+	};
+
+	return reason_codes[rsv->state];
+}
+
+/*
+ * Return the reason code for a reservations's companion DRP IE .
+ */
+int uwb_rsv_companion_reason_code(struct uwb_rsv *rsv)
+{
+	static const int companion_reason_codes[] = {
+		[UWB_RSV_STATE_O_MOVE_EXPANDING]     = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_T_EXPANDING_ACCEPTED] = UWB_DRP_REASON_ACCEPTED,
+		[UWB_RSV_STATE_T_EXPANDING_CONFLICT] = UWB_DRP_REASON_CONFLICT,
+		[UWB_RSV_STATE_T_EXPANDING_PENDING]  = UWB_DRP_REASON_PENDING,
+		[UWB_RSV_STATE_T_EXPANDING_DENIED]   = UWB_DRP_REASON_DENIED,
+	};
+
+	return companion_reason_codes[rsv->state];
+}
+
+/*
+ * Return the status bit for a reservations's DRP IE.
+ */
+int uwb_rsv_status(struct uwb_rsv *rsv)
+{
+	static const int statuses[] = {
+		[UWB_RSV_STATE_O_INITIATED]          = 0,
+		[UWB_RSV_STATE_O_PENDING]            = 0,
+		[UWB_RSV_STATE_O_MODIFIED]           = 1,
+		[UWB_RSV_STATE_O_ESTABLISHED]        = 1,
+		[UWB_RSV_STATE_O_TO_BE_MOVED]        = 0,
+		[UWB_RSV_STATE_O_MOVE_COMBINING]     = 1,
+		[UWB_RSV_STATE_O_MOVE_REDUCING]      = 1,
+		[UWB_RSV_STATE_O_MOVE_EXPANDING]     = 1,
+		[UWB_RSV_STATE_T_ACCEPTED]           = 1,
+		[UWB_RSV_STATE_T_CONFLICT]           = 0,
+		[UWB_RSV_STATE_T_PENDING]            = 0,
+		[UWB_RSV_STATE_T_DENIED]             = 0,
+		[UWB_RSV_STATE_T_RESIZED]            = 1,
+		[UWB_RSV_STATE_T_EXPANDING_ACCEPTED] = 1,
+		[UWB_RSV_STATE_T_EXPANDING_CONFLICT] = 1,
+		[UWB_RSV_STATE_T_EXPANDING_PENDING]  = 1,
+		[UWB_RSV_STATE_T_EXPANDING_DENIED]   = 1,
+
+	};
+
+	return statuses[rsv->state];
+}
+
+/*
+ * Return the status bit for a reservations's companion DRP IE .
+ */
+int uwb_rsv_companion_status(struct uwb_rsv *rsv)
+{
+	static const int companion_statuses[] = {
+		[UWB_RSV_STATE_O_MOVE_EXPANDING]     = 0,
+		[UWB_RSV_STATE_T_EXPANDING_ACCEPTED] = 1,
+		[UWB_RSV_STATE_T_EXPANDING_CONFLICT] = 0,
+		[UWB_RSV_STATE_T_EXPANDING_PENDING]  = 0,
+		[UWB_RSV_STATE_T_EXPANDING_DENIED]   = 0,
+	};
+
+	return companion_statuses[rsv->state];
+}
+
 /*
  * Allocate a DRP IE.
  *
@@ -33,16 +123,12 @@
 static struct uwb_ie_drp *uwb_drp_ie_alloc(void)
 {
 	struct uwb_ie_drp *drp_ie;
-	unsigned tiebreaker;
 
 	drp_ie = kzalloc(sizeof(struct uwb_ie_drp) +
 			UWB_NUM_ZONES * sizeof(struct uwb_drp_alloc),
 			GFP_KERNEL);
 	if (drp_ie) {
 		drp_ie->hdr.element_id = UWB_IE_DRP;
-
-		get_random_bytes(&tiebreaker, sizeof(unsigned));
-		uwb_ie_drp_set_tiebreaker(drp_ie, tiebreaker & 1);
 	}
 	return drp_ie;
 }
@@ -103,43 +189,17 @@ static void uwb_drp_ie_from_bm(struct uwb_ie_drp *drp_ie,
  */
 int uwb_drp_ie_update(struct uwb_rsv *rsv)
 {
-	struct device *dev = &rsv->rc->uwb_dev.dev;
 	struct uwb_ie_drp *drp_ie;
-	int reason_code, status;
+	struct uwb_rsv_move *mv;
+	int unsafe;
 
-	switch (rsv->state) {
-	case UWB_RSV_STATE_NONE:
+	if (rsv->state == UWB_RSV_STATE_NONE) {
 		kfree(rsv->drp_ie);
 		rsv->drp_ie = NULL;
 		return 0;
-	case UWB_RSV_STATE_O_INITIATED:
-		reason_code = UWB_DRP_REASON_ACCEPTED;
-		status = 0;
-		break;
-	case UWB_RSV_STATE_O_PENDING:
-		reason_code = UWB_DRP_REASON_ACCEPTED;
-		status = 0;
-		break;
-	case UWB_RSV_STATE_O_MODIFIED:
-		reason_code = UWB_DRP_REASON_MODIFIED;
-		status = 1;
-		break;
-	case UWB_RSV_STATE_O_ESTABLISHED:
-		reason_code = UWB_DRP_REASON_ACCEPTED;
-		status = 1;
-		break;
-	case UWB_RSV_STATE_T_ACCEPTED:
-		reason_code = UWB_DRP_REASON_ACCEPTED;
-		status = 1;
-		break;
-	case UWB_RSV_STATE_T_DENIED:
-		reason_code = UWB_DRP_REASON_DENIED;
-		status = 0;
-		break;
-	default:
-		dev_dbg(dev, "rsv with unhandled state (%d)\n", rsv->state);
-		return -EINVAL;
 	}
+	
+	unsafe = rsv->mas.unsafe ? 1 : 0;
 
 	if (rsv->drp_ie == NULL) {
 		rsv->drp_ie = uwb_drp_ie_alloc();
@@ -148,9 +208,11 @@ int uwb_drp_ie_update(struct uwb_rsv *rsv)
 	}
 	drp_ie = rsv->drp_ie;
 
+	uwb_ie_drp_set_unsafe(drp_ie,       unsafe);
+	uwb_ie_drp_set_tiebreaker(drp_ie,   rsv->tiebreaker);
 	uwb_ie_drp_set_owner(drp_ie,        uwb_rsv_is_owner(rsv));
-	uwb_ie_drp_set_status(drp_ie,       status);
-	uwb_ie_drp_set_reason_code(drp_ie,  reason_code);
+	uwb_ie_drp_set_status(drp_ie,       uwb_rsv_status(rsv));
+	uwb_ie_drp_set_reason_code(drp_ie,  uwb_rsv_reason_code(rsv));
 	uwb_ie_drp_set_stream_index(drp_ie, rsv->stream);
 	uwb_ie_drp_set_type(drp_ie,         rsv->type);
 
@@ -168,6 +230,27 @@ int uwb_drp_ie_update(struct uwb_rsv *rsv)
 
 	uwb_drp_ie_from_bm(drp_ie, &rsv->mas);
 
+	if (uwb_rsv_has_two_drp_ies(rsv)) {
+		mv = &rsv->mv; 
+		if (mv->companion_drp_ie == NULL) {
+			mv->companion_drp_ie = uwb_drp_ie_alloc();
+			if (mv->companion_drp_ie == NULL)
+				return -ENOMEM;
+		}
+		drp_ie = mv->companion_drp_ie;
+		
+		/* keep all the same configuration of the main drp_ie */
+		memcpy(drp_ie, rsv->drp_ie, sizeof(struct uwb_ie_drp));
+		
+
+		/* FIXME: handle properly the unsafe bit */
+		uwb_ie_drp_set_unsafe(drp_ie,       1);
+		uwb_ie_drp_set_status(drp_ie,       uwb_rsv_companion_status(rsv));
+		uwb_ie_drp_set_reason_code(drp_ie,  uwb_rsv_companion_reason_code(rsv));
+	
+		uwb_drp_ie_from_bm(drp_ie, &mv->companion_mas);
+	}
+
 	rsv->ie_valid = true;
 	return 0;
 }
@@ -218,6 +301,8 @@ void uwb_drp_ie_to_bm(struct uwb_mas_bm *bm, const struct uwb_ie_drp *drp_ie)
 	u8 zone;
 	u16 zone_mask;
 
+	bitmap_zero(bm->bm, UWB_NUM_MAS);
+
 	for (cnt = 0; cnt < numallocs; cnt++) {
 		alloc = &drp_ie->allocs[cnt];
 		zone_bm = le16_to_cpu(alloc->zone_bm);
@@ -229,3 +314,4 @@ void uwb_drp_ie_to_bm(struct uwb_mas_bm *bm, const struct uwb_ie_drp *drp_ie)
 		}
 	}
 }
+
diff --git a/drivers/uwb/drp.c b/drivers/uwb/drp.c
index fe328146adb..2b4f9406789 100644
--- a/drivers/uwb/drp.c
+++ b/drivers/uwb/drp.c
@@ -23,6 +23,59 @@
 #include <linux/delay.h>
 #include "uwb-internal.h"
 
+
+/* DRP Conflict Actions ([ECMA-368 2nd Edition] 17.4.6) */
+enum uwb_drp_conflict_action {
+	/* Reservation is mantained, no action needed */
+	UWB_DRP_CONFLICT_MANTAIN = 0,
+	
+	/* the device shall not transmit frames in conflicting MASs in
+	 * the following superframe. If the device is the reservation
+	 * target, it shall also set the Reason Code in its DRP IE to
+	 * Conflict in its beacon in the following superframe.
+	 */
+	UWB_DRP_CONFLICT_ACT1,
+	
+	/* the device shall not set the Reservation Status bit to ONE
+	 * and shall not transmit frames in conflicting MASs. If the
+	 * device is the reservation target, it shall also set the
+	 * Reason Code in its DRP IE to Conflict.
+	 */	
+	UWB_DRP_CONFLICT_ACT2,
+
+	/* the device shall not transmit frames in conflicting MASs in
+	 * the following superframe. It shall remove the conflicting
+	 * MASs from the reservation or set the Reservation Status to
+	 * ZERO in its beacon in the following superframe. If the
+	 * device is the reservation target, it shall also set the
+	 * Reason Code in its DRP IE to Conflict.
+	 */
+	UWB_DRP_CONFLICT_ACT3,
+};
+
+
+static void uwb_rc_set_drp_cmd_done(struct uwb_rc *rc, void *arg,
+				    struct uwb_rceb *reply, ssize_t reply_size)
+{
+	struct uwb_rc_evt_set_drp_ie *r = (struct uwb_rc_evt_set_drp_ie *)reply;
+
+	if (r != NULL) {
+		if (r->bResultCode != UWB_RC_RES_SUCCESS)
+			dev_err(&rc->uwb_dev.dev, "SET-DRP-IE failed: %s (%d)\n",
+				uwb_rc_strerror(r->bResultCode), r->bResultCode);
+	} else
+		dev_err(&rc->uwb_dev.dev, "SET-DRP-IE: timeout\n");
+
+	spin_lock(&rc->rsvs_lock);
+	if (rc->set_drp_ie_pending > 1) {
+		rc->set_drp_ie_pending = 0;
+		uwb_rsv_queue_update(rc);	
+	} else {
+		rc->set_drp_ie_pending = 0;	
+	}
+	spin_unlock(&rc->rsvs_lock);
+}
+
 /**
  * Construct and send the SET DRP IE
  *
@@ -46,18 +99,23 @@
 int uwb_rc_send_all_drp_ie(struct uwb_rc *rc)
 {
 	int result;
-	struct device *dev = &rc->uwb_dev.dev;
 	struct uwb_rc_cmd_set_drp_ie *cmd;
-	struct uwb_rc_evt_set_drp_ie reply;
 	struct uwb_rsv *rsv;
+	struct uwb_rsv_move *mv;
 	int num_bytes = 0;
 	u8 *IEDataptr;
 
 	result = -ENOMEM;
 	/* First traverse all reservations to determine memory needed. */
 	list_for_each_entry(rsv, &rc->reservations, rc_node) {
-		if (rsv->drp_ie != NULL)
+		if (rsv->drp_ie != NULL) {
 			num_bytes += rsv->drp_ie->hdr.length + 2;
+			if (uwb_rsv_has_two_drp_ies(rsv) &&
+				(rsv->mv.companion_drp_ie != NULL)) {
+				mv = &rsv->mv;
+				num_bytes += mv->companion_drp_ie->hdr.length + 2;	
+			}
+		}
 	}
 	num_bytes += sizeof(rc->drp_avail.ie);
 	cmd = kzalloc(sizeof(*cmd) + num_bytes, GFP_KERNEL);
@@ -68,109 +126,322 @@ int uwb_rc_send_all_drp_ie(struct uwb_rc *rc)
 	cmd->wIELength = num_bytes;
 	IEDataptr = (u8 *)&cmd->IEData[0];
 
+	/* FIXME: DRV avail IE is not always needed */
+	/* put DRP avail IE first */
+	memcpy(IEDataptr, &rc->drp_avail.ie, sizeof(rc->drp_avail.ie));
+	IEDataptr += sizeof(struct uwb_ie_drp_avail);
+
 	/* Next traverse all reservations to place IEs in allocated memory. */
 	list_for_each_entry(rsv, &rc->reservations, rc_node) {
 		if (rsv->drp_ie != NULL) {
 			memcpy(IEDataptr, rsv->drp_ie,
 			       rsv->drp_ie->hdr.length + 2);
 			IEDataptr += rsv->drp_ie->hdr.length + 2;
+			
+			if (uwb_rsv_has_two_drp_ies(rsv) &&
+				(rsv->mv.companion_drp_ie != NULL)) {
+				mv = &rsv->mv;
+				memcpy(IEDataptr, mv->companion_drp_ie,
+				       mv->companion_drp_ie->hdr.length + 2);
+				IEDataptr += mv->companion_drp_ie->hdr.length + 2;	
+			}
 		}
 	}
-	memcpy(IEDataptr, &rc->drp_avail.ie, sizeof(rc->drp_avail.ie));
 
-	reply.rceb.bEventType = UWB_RC_CET_GENERAL;
-	reply.rceb.wEvent = UWB_RC_CMD_SET_DRP_IE;
-	result = uwb_rc_cmd(rc, "SET-DRP-IE", &cmd->rccb,
-			sizeof(*cmd) + num_bytes, &reply.rceb,
-			sizeof(reply));
-	if (result < 0)
-		goto error_cmd;
-	result = le16_to_cpu(reply.wRemainingSpace);
-	if (reply.bResultCode != UWB_RC_RES_SUCCESS) {
-		dev_err(&rc->uwb_dev.dev, "SET-DRP-IE: command execution "
-				"failed: %s (%d). RemainingSpace in beacon "
-				"= %d\n", uwb_rc_strerror(reply.bResultCode),
-				reply.bResultCode, result);
-		result = -EIO;
-	} else {
-		dev_dbg(dev, "SET-DRP-IE sent. RemainingSpace in beacon "
-			     "= %d.\n", result);
-		result = 0;
-	}
-error_cmd:
+	result = uwb_rc_cmd_async(rc, "SET-DRP-IE", &cmd->rccb, sizeof(*cmd) + num_bytes,
+				  UWB_RC_CET_GENERAL, UWB_RC_CMD_SET_DRP_IE,
+				  uwb_rc_set_drp_cmd_done, NULL);
+	
+	rc->set_drp_ie_pending = 1;
+
 	kfree(cmd);
 error:
 	return result;
 }
 
-void uwb_drp_handle_timeout(struct uwb_rsv *rsv)
+/*
+ * Evaluate the action to perform using conflict resolution rules
+ *
+ * Return a uwb_drp_conflict_action.
+ */
+static int evaluate_conflict_action(struct uwb_ie_drp *ext_drp_ie, int ext_beacon_slot,
+				    struct uwb_rsv *rsv, int our_status)
 {
-	struct device *dev = &rsv->rc->uwb_dev.dev;
+	int our_tie_breaker = rsv->tiebreaker;
+	int our_type        = rsv->type;
+	int our_beacon_slot = rsv->rc->uwb_dev.beacon_slot;
+
+	int ext_tie_breaker = uwb_ie_drp_tiebreaker(ext_drp_ie);
+	int ext_status      = uwb_ie_drp_status(ext_drp_ie);
+	int ext_type        = uwb_ie_drp_type(ext_drp_ie);
+	
+	
+	/* [ECMA-368 2nd Edition] 17.4.6 */
+	if (ext_type == UWB_DRP_TYPE_PCA && our_type == UWB_DRP_TYPE_PCA) {
+		return UWB_DRP_CONFLICT_MANTAIN;
+	}
+
+	/* [ECMA-368 2nd Edition] 17.4.6-1 */
+	if (our_type == UWB_DRP_TYPE_ALIEN_BP) {
+		return UWB_DRP_CONFLICT_MANTAIN;
+	}
+	
+	/* [ECMA-368 2nd Edition] 17.4.6-2 */
+	if (ext_type == UWB_DRP_TYPE_ALIEN_BP) {
+		/* here we know our_type != UWB_DRP_TYPE_ALIEN_BP */
+		return UWB_DRP_CONFLICT_ACT1;
+	}
+
+	/* [ECMA-368 2nd Edition] 17.4.6-3 */
+	if (our_status == 0 && ext_status == 1) {
+		return UWB_DRP_CONFLICT_ACT2;
+	}
 
-	dev_dbg(dev, "reservation timeout in state %s (%d)\n",
-		uwb_rsv_state_str(rsv->state), rsv->state);
+	/* [ECMA-368 2nd Edition] 17.4.6-4 */
+	if (our_status == 1 && ext_status == 0) {
+		return UWB_DRP_CONFLICT_MANTAIN;
+	}
 
-	switch (rsv->state) {
-	case UWB_RSV_STATE_O_INITIATED:
-		if (rsv->is_multicast) {
-			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
-			return;
+	/* [ECMA-368 2nd Edition] 17.4.6-5a */
+	if (our_tie_breaker == ext_tie_breaker &&
+	    our_beacon_slot <  ext_beacon_slot) {
+		return UWB_DRP_CONFLICT_MANTAIN;
+	}
+
+	/* [ECMA-368 2nd Edition] 17.4.6-5b */
+	if (our_tie_breaker != ext_tie_breaker &&
+	    our_beacon_slot >  ext_beacon_slot) {
+		return UWB_DRP_CONFLICT_MANTAIN;
+	}
+	
+	if (our_status == 0) {
+		if (our_tie_breaker == ext_tie_breaker) {
+			/* [ECMA-368 2nd Edition] 17.4.6-6a */
+			if (our_beacon_slot > ext_beacon_slot) {
+				return UWB_DRP_CONFLICT_ACT2;
+			}
+		} else  {
+			/* [ECMA-368 2nd Edition] 17.4.6-6b */
+			if (our_beacon_slot < ext_beacon_slot) {
+				return UWB_DRP_CONFLICT_ACT2;
+			}
 		}
-		break;
-	case UWB_RSV_STATE_O_ESTABLISHED:
-		if (rsv->is_multicast)
-			return;
-		break;
-	default:
-		break;
+	} else {
+		if (our_tie_breaker == ext_tie_breaker) {
+			/* [ECMA-368 2nd Edition] 17.4.6-7a */
+			if (our_beacon_slot > ext_beacon_slot) {
+				return UWB_DRP_CONFLICT_ACT3;
+			}
+		} else {
+			/* [ECMA-368 2nd Edition] 17.4.6-7b */
+			if (our_beacon_slot < ext_beacon_slot) {
+				return UWB_DRP_CONFLICT_ACT3;
+			}
+		}
+	}
+	return UWB_DRP_CONFLICT_MANTAIN;
+}
+
+static void handle_conflict_normal(struct uwb_ie_drp *drp_ie, 
+				   int ext_beacon_slot, 
+				   struct uwb_rsv *rsv, 
+				   struct uwb_mas_bm *conflicting_mas)
+{
+	struct uwb_rc *rc = rsv->rc;
+	struct uwb_rsv_move *mv = &rsv->mv;
+	struct uwb_drp_backoff_win *bow = &rc->bow;
+	int action;
+
+	action = evaluate_conflict_action(drp_ie, ext_beacon_slot, rsv, uwb_rsv_status(rsv));
+
+	if (uwb_rsv_is_owner(rsv)) {
+		switch(action) {
+		case UWB_DRP_CONFLICT_ACT2:
+			/* try move */
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_TO_BE_MOVED);
+			if (bow->can_reserve_extra_mases == false)
+				uwb_rsv_backoff_win_increment(rc);
+			
+			break;
+		case UWB_DRP_CONFLICT_ACT3:
+			uwb_rsv_backoff_win_increment(rc);
+			/* drop some mases with reason modified */
+			/* put in the companion the mases to be dropped */
+			bitmap_and(mv->companion_mas.bm, rsv->mas.bm, conflicting_mas->bm, UWB_NUM_MAS);
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MODIFIED);
+		default:
+			break;
+		}
+	} else {
+		switch(action) {
+		case UWB_DRP_CONFLICT_ACT2:
+		case UWB_DRP_CONFLICT_ACT3:
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_CONFLICT);	
+		default:
+			break;
+		}
+
+	}
+	
+}
+
+static void handle_conflict_expanding(struct uwb_ie_drp *drp_ie, int ext_beacon_slot,
+				      struct uwb_rsv *rsv, bool companion_only,
+				      struct uwb_mas_bm *conflicting_mas)
+{
+	struct uwb_rc *rc = rsv->rc;
+	struct uwb_drp_backoff_win *bow = &rc->bow;
+	struct uwb_rsv_move *mv = &rsv->mv;
+	int action;
+	
+	if (companion_only) {
+		/* status of companion is 0 at this point */
+		action = evaluate_conflict_action(drp_ie, ext_beacon_slot, rsv, 0);
+		if (uwb_rsv_is_owner(rsv)) {
+			switch(action) {
+			case UWB_DRP_CONFLICT_ACT2:
+			case UWB_DRP_CONFLICT_ACT3:
+				uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
+				rsv->needs_release_companion_mas = false;
+				if (bow->can_reserve_extra_mases == false)
+					uwb_rsv_backoff_win_increment(rc);
+				uwb_drp_avail_release(rsv->rc, &rsv->mv.companion_mas);
+			}
+		} else { /* rsv is target */			
+			switch(action) {
+			case UWB_DRP_CONFLICT_ACT2:
+			case UWB_DRP_CONFLICT_ACT3:
+				uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_EXPANDING_CONFLICT);
+                                /* send_drp_avail_ie = true; */
+			}
+		}
+	} else { /* also base part of the reservation is conflicting */		
+		if (uwb_rsv_is_owner(rsv)) {
+			uwb_rsv_backoff_win_increment(rc);
+			/* remove companion part */
+			uwb_drp_avail_release(rsv->rc, &rsv->mv.companion_mas);
+
+			/* drop some mases with reason modified */
+
+			/* put in the companion the mases to be dropped */
+			bitmap_andnot(mv->companion_mas.bm, rsv->mas.bm, conflicting_mas->bm, UWB_NUM_MAS);
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MODIFIED);
+		} else { /* it is a target rsv */
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_CONFLICT);
+                        /* send_drp_avail_ie = true; */
+		}
+	}
+}
+
+static void uwb_drp_handle_conflict_rsv(struct uwb_rc *rc, struct uwb_rsv *rsv,
+					struct uwb_rc_evt_drp *drp_evt, 
+					struct uwb_ie_drp *drp_ie,
+					struct uwb_mas_bm *conflicting_mas)
+{
+	struct uwb_rsv_move *mv;
+
+	/* check if the conflicting reservation has two drp_ies */
+	if (uwb_rsv_has_two_drp_ies(rsv)) {
+		mv = &rsv->mv;
+		if (bitmap_intersects(rsv->mas.bm, conflicting_mas->bm, UWB_NUM_MAS)) {
+			handle_conflict_expanding(drp_ie, drp_evt->beacon_slot_number,
+						  rsv, false, conflicting_mas);
+		} else {
+			if (bitmap_intersects(mv->companion_mas.bm, conflicting_mas->bm, UWB_NUM_MAS)) {
+				handle_conflict_expanding(drp_ie, drp_evt->beacon_slot_number,
+							  rsv, true, conflicting_mas);	
+			}
+		}
+	} else if (bitmap_intersects(rsv->mas.bm, conflicting_mas->bm, UWB_NUM_MAS)) {
+		handle_conflict_normal(drp_ie, drp_evt->beacon_slot_number, rsv, conflicting_mas);
 	}
-	uwb_rsv_remove(rsv);
 }
 
+static void uwb_drp_handle_all_conflict_rsv(struct uwb_rc *rc,
+					    struct uwb_rc_evt_drp *drp_evt, 
+					    struct uwb_ie_drp *drp_ie,
+					    struct uwb_mas_bm *conflicting_mas)
+{
+	struct uwb_rsv *rsv;
+	
+	list_for_each_entry(rsv, &rc->reservations, rc_node) {
+		uwb_drp_handle_conflict_rsv(rc, rsv, drp_evt, drp_ie, conflicting_mas);	
+	}
+}
+	
 /*
  * Based on the DRP IE, transition a target reservation to a new
  * state.
  */
 static void uwb_drp_process_target(struct uwb_rc *rc, struct uwb_rsv *rsv,
-				   struct uwb_ie_drp *drp_ie)
+				   struct uwb_ie_drp *drp_ie, struct uwb_rc_evt_drp *drp_evt)
 {
 	struct device *dev = &rc->uwb_dev.dev;
+	struct uwb_rsv_move *mv = &rsv->mv;
 	int status;
 	enum uwb_drp_reason reason_code;
-
+	struct uwb_mas_bm mas;
+	
 	status = uwb_ie_drp_status(drp_ie);
 	reason_code = uwb_ie_drp_reason_code(drp_ie);
+	uwb_drp_ie_to_bm(&mas, drp_ie);
 
-	if (status) {
-		switch (reason_code) {
-		case UWB_DRP_REASON_ACCEPTED:
-			uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_ACCEPTED);
-			break;
-		case UWB_DRP_REASON_MODIFIED:
-			dev_err(dev, "FIXME: unhandled reason code (%d/%d)\n",
-				reason_code, status);
+	switch (reason_code) {
+	case UWB_DRP_REASON_ACCEPTED:
+
+		if (rsv->state == UWB_RSV_STATE_T_CONFLICT) {
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_CONFLICT);
 			break;
-		default:
-			dev_warn(dev, "ignoring invalid DRP IE state (%d/%d)\n",
-				 reason_code, status);
 		}
-	} else {
-		switch (reason_code) {
-		case UWB_DRP_REASON_ACCEPTED:
-			/* New reservations are handled in uwb_rsv_find(). */
-			break;
-		case UWB_DRP_REASON_DENIED:
-			uwb_rsv_set_state(rsv, UWB_RSV_STATE_NONE);
-			break;
-		case UWB_DRP_REASON_CONFLICT:
-		case UWB_DRP_REASON_MODIFIED:
-			dev_err(dev, "FIXME: unhandled reason code (%d/%d)\n",
-				reason_code, status);
+
+		if (rsv->state == UWB_RSV_STATE_T_EXPANDING_ACCEPTED) {
+			/* drp_ie is companion */
+			if (!bitmap_equal(rsv->mas.bm, mas.bm, UWB_NUM_MAS))
+				/* stroke companion */
+				uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_EXPANDING_ACCEPTED);	
+		} else {
+			if (!bitmap_equal(rsv->mas.bm, mas.bm, UWB_NUM_MAS)) {
+				if (uwb_drp_avail_reserve_pending(rc, &mas) == -EBUSY) {
+					/* FIXME: there is a conflict, find
+					 * the conflicting reservations and
+					 * take a sensible action. Consider
+					 * that in drp_ie there is the
+					 * "neighbour" */
+					uwb_drp_handle_all_conflict_rsv(rc, drp_evt, drp_ie, &mas);
+				} else {
+					/* accept the extra reservation */
+					bitmap_copy(mv->companion_mas.bm, mas.bm, UWB_NUM_MAS);
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_EXPANDING_ACCEPTED);
+				}
+			} else {
+				if (status) {
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_ACCEPTED);
+				}
+			}
+			
+		}
+		break;
+
+	case UWB_DRP_REASON_MODIFIED:
+		/* check to see if we have already modified the reservation */
+		if (bitmap_equal(rsv->mas.bm, mas.bm, UWB_NUM_MAS)) {
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_ACCEPTED);
 			break;
-		default:
-			dev_warn(dev, "ignoring invalid DRP IE state (%d/%d)\n",
-				 reason_code, status);
 		}
+
+		/* find if the owner wants to expand or reduce */
+		if (bitmap_subset(mas.bm, rsv->mas.bm, UWB_NUM_MAS)) {
+			/* owner is reducing */
+			bitmap_andnot(mv->companion_mas.bm, rsv->mas.bm, mas.bm, UWB_NUM_MAS);
+			uwb_drp_avail_release(rsv->rc, &mv->companion_mas);
+		}
+
+		bitmap_copy(rsv->mas.bm, mas.bm, UWB_NUM_MAS);
+		uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_RESIZED);
+		break;
+	default:
+		dev_warn(dev, "ignoring invalid DRP IE state (%d/%d)\n",
+			 reason_code, status);
 	}
 }
 
@@ -179,23 +450,60 @@ static void uwb_drp_process_target(struct uwb_rc *rc, struct uwb_rsv *rsv,
  * state.
  */
 static void uwb_drp_process_owner(struct uwb_rc *rc, struct uwb_rsv *rsv,
-				  struct uwb_ie_drp *drp_ie)
+				  struct uwb_dev *src, struct uwb_ie_drp *drp_ie,
+				  struct uwb_rc_evt_drp *drp_evt)
 {
 	struct device *dev = &rc->uwb_dev.dev;
+	struct uwb_rsv_move *mv = &rsv->mv;
 	int status;
 	enum uwb_drp_reason reason_code;
+	struct uwb_mas_bm mas;
 
 	status = uwb_ie_drp_status(drp_ie);
 	reason_code = uwb_ie_drp_reason_code(drp_ie);
+	uwb_drp_ie_to_bm(&mas, drp_ie);
 
 	if (status) {
 		switch (reason_code) {
 		case UWB_DRP_REASON_ACCEPTED:
-			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
-			break;
-		case UWB_DRP_REASON_MODIFIED:
-			dev_err(dev, "FIXME: unhandled reason code (%d/%d)\n",
-				reason_code, status);
+			switch (rsv->state) {
+			case UWB_RSV_STATE_O_PENDING:
+			case UWB_RSV_STATE_O_INITIATED:
+			case UWB_RSV_STATE_O_ESTABLISHED:
+				uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
+				break;
+			case UWB_RSV_STATE_O_MODIFIED:
+				if (bitmap_equal(mas.bm, rsv->mas.bm, UWB_NUM_MAS)) {
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
+				} else {
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MODIFIED);	
+				}
+				break;
+				
+			case UWB_RSV_STATE_O_MOVE_REDUCING: /* shouldn' t be a problem */
+				if (bitmap_equal(mas.bm, rsv->mas.bm, UWB_NUM_MAS)) {
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
+				} else {
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_REDUCING);	
+				}
+				break;
+			case UWB_RSV_STATE_O_MOVE_EXPANDING:
+				if (bitmap_equal(mas.bm, mv->companion_mas.bm, UWB_NUM_MAS)) {
+					/* Companion reservation accepted */
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_COMBINING);
+				} else {
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_EXPANDING);
+				}
+				break;
+			case UWB_RSV_STATE_O_MOVE_COMBINING:
+				if (bitmap_equal(mas.bm, rsv->mas.bm, UWB_NUM_MAS))
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_REDUCING);
+				else
+					uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_COMBINING);
+				break;
+			default:
+				break;	
+			}
 			break;
 		default:
 			dev_warn(dev, "ignoring invalid DRP IE state (%d/%d)\n",
@@ -210,9 +518,10 @@ static void uwb_drp_process_owner(struct uwb_rc *rc, struct uwb_rsv *rsv,
 			uwb_rsv_set_state(rsv, UWB_RSV_STATE_NONE);
 			break;
 		case UWB_DRP_REASON_CONFLICT:
-		case UWB_DRP_REASON_MODIFIED:
-			dev_err(dev, "FIXME: unhandled reason code (%d/%d)\n",
-				reason_code, status);
+			/* resolve the conflict */
+			bitmap_complement(mas.bm, src->last_availability_bm,
+					  UWB_NUM_MAS);
+			uwb_drp_handle_conflict_rsv(rc, rsv, drp_evt, drp_ie, &mas);
 			break;
 		default:
 			dev_warn(dev, "ignoring invalid DRP IE state (%d/%d)\n",
@@ -221,12 +530,110 @@ static void uwb_drp_process_owner(struct uwb_rc *rc, struct uwb_rsv *rsv,
 	}
 }
 
+static void uwb_cnflt_alien_stroke_timer(struct uwb_cnflt_alien *cnflt)
+{
+	unsigned timeout_us = UWB_MAX_LOST_BEACONS * UWB_SUPERFRAME_LENGTH_US;
+	mod_timer(&cnflt->timer, jiffies + usecs_to_jiffies(timeout_us));
+}
+
+static void uwb_cnflt_update_work(struct work_struct *work)
+{
+	struct uwb_cnflt_alien *cnflt = container_of(work,
+						     struct uwb_cnflt_alien,
+						     cnflt_update_work);
+	struct uwb_cnflt_alien *c;
+	struct uwb_rc *rc = cnflt->rc;
+	
+	unsigned long delay_us = UWB_MAS_LENGTH_US * UWB_MAS_PER_ZONE;
+	
+	mutex_lock(&rc->rsvs_mutex);
+
+	list_del(&cnflt->rc_node);
+
+	/* update rc global conflicting alien bitmap */
+	bitmap_zero(rc->cnflt_alien_bitmap.bm, UWB_NUM_MAS);
+
+	list_for_each_entry(c, &rc->cnflt_alien_list, rc_node) {
+		bitmap_or(rc->cnflt_alien_bitmap.bm, rc->cnflt_alien_bitmap.bm, c->mas.bm, UWB_NUM_MAS);			
+	}
+	
+	queue_delayed_work(rc->rsv_workq, &rc->rsv_alien_bp_work, usecs_to_jiffies(delay_us));
+
+	kfree(cnflt);
+	mutex_unlock(&rc->rsvs_mutex);
+}
+
+static void uwb_cnflt_timer(unsigned long arg)
+{
+	struct uwb_cnflt_alien *cnflt = (struct uwb_cnflt_alien *)arg;
+
+	queue_work(cnflt->rc->rsv_workq, &cnflt->cnflt_update_work);
+}
+
 /*
- * Process a received DRP IE, it's either for a reservation owned by
- * the RC or targeted at it (or it's for a WUSB cluster reservation).
+ * We have received an DRP_IE of type Alien BP and we need to make
+ * sure we do not transmit in conflicting MASs.
  */
-static void uwb_drp_process(struct uwb_rc *rc, struct uwb_dev *src,
-		     struct uwb_ie_drp *drp_ie)
+static void uwb_drp_handle_alien_drp(struct uwb_rc *rc, struct uwb_ie_drp *drp_ie)
+{
+	struct device *dev = &rc->uwb_dev.dev;
+	struct uwb_mas_bm mas;
+	struct uwb_cnflt_alien *cnflt;
+	char buf[72];
+	unsigned long delay_us = UWB_MAS_LENGTH_US * UWB_MAS_PER_ZONE;
+	
+	uwb_drp_ie_to_bm(&mas, drp_ie);
+	bitmap_scnprintf(buf, sizeof(buf), mas.bm, UWB_NUM_MAS);
+	
+	list_for_each_entry(cnflt, &rc->cnflt_alien_list, rc_node) {
+		if (bitmap_equal(cnflt->mas.bm, mas.bm, UWB_NUM_MAS)) {
+			/* Existing alien BP reservation conflicting
+			 * bitmap, just reset the timer */
+			uwb_cnflt_alien_stroke_timer(cnflt);
+			return;
+		}
+	}
+
+	/* New alien BP reservation conflicting bitmap */
+
+	/* alloc and initialize new uwb_cnflt_alien */
+	cnflt = kzalloc(sizeof(struct uwb_cnflt_alien), GFP_KERNEL);
+	if (!cnflt)
+		dev_err(dev, "failed to alloc uwb_cnflt_alien struct\n");
+	INIT_LIST_HEAD(&cnflt->rc_node);
+	init_timer(&cnflt->timer);
+	cnflt->timer.function = uwb_cnflt_timer;
+	cnflt->timer.data     = (unsigned long)cnflt;
+
+	cnflt->rc = rc;
+	INIT_WORK(&cnflt->cnflt_update_work, uwb_cnflt_update_work);
+	
+	bitmap_copy(cnflt->mas.bm, mas.bm, UWB_NUM_MAS);
+
+	list_add_tail(&cnflt->rc_node, &rc->cnflt_alien_list);
+
+	/* update rc global conflicting alien bitmap */
+	bitmap_or(rc->cnflt_alien_bitmap.bm, rc->cnflt_alien_bitmap.bm, mas.bm, UWB_NUM_MAS);
+
+	queue_delayed_work(rc->rsv_workq, &rc->rsv_alien_bp_work, usecs_to_jiffies(delay_us));
+	
+	/* start the timer */
+	uwb_cnflt_alien_stroke_timer(cnflt);
+}
+
+static void uwb_drp_process_not_involved(struct uwb_rc *rc,
+					 struct uwb_rc_evt_drp *drp_evt, 
+					 struct uwb_ie_drp *drp_ie)
+{
+	struct uwb_mas_bm mas;
+	
+	uwb_drp_ie_to_bm(&mas, drp_ie);
+	uwb_drp_handle_all_conflict_rsv(rc, drp_evt, drp_ie, &mas);
+}
+
+static void uwb_drp_process_involved(struct uwb_rc *rc, struct uwb_dev *src,
+				     struct uwb_rc_evt_drp *drp_evt,
+				     struct uwb_ie_drp *drp_ie)
 {
 	struct uwb_rsv *rsv;
 
@@ -239,7 +646,7 @@ static void uwb_drp_process(struct uwb_rc *rc, struct uwb_dev *src,
 		 */
 		return;
 	}
-
+	
 	/*
 	 * Do nothing with DRP IEs for reservations that have been
 	 * terminated.
@@ -248,13 +655,43 @@ static void uwb_drp_process(struct uwb_rc *rc, struct uwb_dev *src,
 		uwb_rsv_set_state(rsv, UWB_RSV_STATE_NONE);
 		return;
 	}
-
+			
 	if (uwb_ie_drp_owner(drp_ie))
-		uwb_drp_process_target(rc, rsv, drp_ie);
+		uwb_drp_process_target(rc, rsv, drp_ie, drp_evt);
+	else
+		uwb_drp_process_owner(rc, rsv, src, drp_ie, drp_evt);
+	
+}
+
+
+static bool uwb_drp_involves_us(struct uwb_rc *rc, struct uwb_ie_drp *drp_ie)
+{
+	return uwb_dev_addr_cmp(&rc->uwb_dev.dev_addr, &drp_ie->dev_addr) == 0;
+}
+
+/*
+ * Process a received DRP IE.
+ */
+static void uwb_drp_process(struct uwb_rc *rc, struct uwb_rc_evt_drp *drp_evt,
+			    struct uwb_dev *src, struct uwb_ie_drp *drp_ie)
+{
+	if (uwb_ie_drp_type(drp_ie) == UWB_DRP_TYPE_ALIEN_BP)
+		uwb_drp_handle_alien_drp(rc, drp_ie);
+	else if (uwb_drp_involves_us(rc, drp_ie))
+		uwb_drp_process_involved(rc, src, drp_evt, drp_ie);
 	else
-		uwb_drp_process_owner(rc, rsv, drp_ie);
+		uwb_drp_process_not_involved(rc, drp_evt, drp_ie);
 }
 
+/*
+ * Process a received DRP Availability IE
+ */
+static void uwb_drp_availability_process(struct uwb_rc *rc, struct uwb_dev *src,
+					 struct uwb_ie_drp_avail *drp_availability_ie)
+{
+	bitmap_copy(src->last_availability_bm,
+		    drp_availability_ie->bmp, UWB_NUM_MAS);
+}
 
 /*
  * Process all the DRP IEs (both DRP IEs and the DRP Availability IE)
@@ -276,10 +713,10 @@ void uwb_drp_process_all(struct uwb_rc *rc, struct uwb_rc_evt_drp *drp_evt,
 
 		switch (ie_hdr->element_id) {
 		case UWB_IE_DRP_AVAILABILITY:
-			/* FIXME: does something need to be done with this? */
+			uwb_drp_availability_process(rc, src_dev, (struct uwb_ie_drp_avail *)ie_hdr);
 			break;
 		case UWB_IE_DRP:
-			uwb_drp_process(rc, src_dev, (struct uwb_ie_drp *)ie_hdr);
+			uwb_drp_process(rc, drp_evt, src_dev, (struct uwb_ie_drp *)ie_hdr);
 			break;
 		default:
 			dev_warn(dev, "unexpected IE in DRP notification\n");
@@ -292,55 +729,6 @@ void uwb_drp_process_all(struct uwb_rc *rc, struct uwb_rc_evt_drp *drp_evt,
 			 (int)ielen);
 }
 
-
-/*
- * Go through all the DRP IEs and find the ones that conflict with our
- * reservations.
- *
- * FIXME: must resolve the conflict according the the rules in
- * [ECMA-368].
- */
-static
-void uwb_drp_process_conflict_all(struct uwb_rc *rc, struct uwb_rc_evt_drp *drp_evt,
-				  size_t ielen, struct uwb_dev *src_dev)
-{
-	struct device *dev = &rc->uwb_dev.dev;
-	struct uwb_ie_hdr *ie_hdr;
-	struct uwb_ie_drp *drp_ie;
-	void *ptr;
-
-	ptr = drp_evt->ie_data;
-	for (;;) {
-		ie_hdr = uwb_ie_next(&ptr, &ielen);
-		if (!ie_hdr)
-			break;
-
-		drp_ie = container_of(ie_hdr, struct uwb_ie_drp, hdr);
-
-		/* FIXME: check if this DRP IE conflicts. */
-	}
-
-	if (ielen > 0)
-		dev_warn(dev, "%d octets remaining in DRP notification\n",
-			 (int)ielen);
-}
-
-
-/*
- * Terminate all reservations owned by, or targeted at, 'uwb_dev'.
- */
-static void uwb_drp_terminate_all(struct uwb_rc *rc, struct uwb_dev *uwb_dev)
-{
-	struct uwb_rsv *rsv;
-
-	list_for_each_entry(rsv, &rc->reservations, rc_node) {
-		if (rsv->owner == uwb_dev
-		    || (rsv->target.type == UWB_RSV_TARGET_DEV && rsv->target.dev == uwb_dev))
-			uwb_rsv_remove(rsv);
-	}
-}
-
-
 /**
  * uwbd_evt_handle_rc_drp - handle a DRP_IE event
  * @evt: the DRP_IE event from the radio controller
@@ -381,7 +769,6 @@ int uwbd_evt_handle_rc_drp(struct uwb_event *evt)
 	size_t ielength, bytes_left;
 	struct uwb_dev_addr src_addr;
 	struct uwb_dev *src_dev;
-	int reason;
 
 	/* Is there enough data to decode the event (and any IEs in
 	   its payload)? */
@@ -417,22 +804,8 @@ int uwbd_evt_handle_rc_drp(struct uwb_event *evt)
 
 	mutex_lock(&rc->rsvs_mutex);
 
-	reason = uwb_rc_evt_drp_reason(drp_evt);
-
-	switch (reason) {
-	case UWB_DRP_NOTIF_DRP_IE_RCVD:
-		uwb_drp_process_all(rc, drp_evt, ielength, src_dev);
-		break;
-	case UWB_DRP_NOTIF_CONFLICT:
-		uwb_drp_process_conflict_all(rc, drp_evt, ielength, src_dev);
-		break;
-	case UWB_DRP_NOTIF_TERMINATE:
-		uwb_drp_terminate_all(rc, src_dev);
-		break;
-	default:
-		dev_warn(dev, "ignored DRP event with reason code: %d\n", reason);
-		break;
-	}
+	/* We do not distinguish from the reason */
+	uwb_drp_process_all(rc, drp_evt, ielength, src_dev);
 
 	mutex_unlock(&rc->rsvs_mutex);
 
diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index 1cd84f92754..165aec6a8f9 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -17,20 +17,31 @@
  */
 #include <linux/kernel.h>
 #include <linux/uwb.h>
+#include <linux/random.h>
 
 #include "uwb-internal.h"
 
 static void uwb_rsv_timer(unsigned long arg);
 
 static const char *rsv_states[] = {
-	[UWB_RSV_STATE_NONE]          = "none",
-	[UWB_RSV_STATE_O_INITIATED]   = "initiated",
-	[UWB_RSV_STATE_O_PENDING]     = "pending",
-	[UWB_RSV_STATE_O_MODIFIED]    = "modified",
-	[UWB_RSV_STATE_O_ESTABLISHED] = "established",
-	[UWB_RSV_STATE_T_ACCEPTED]    = "accepted",
-	[UWB_RSV_STATE_T_DENIED]      = "denied",
-	[UWB_RSV_STATE_T_PENDING]     = "pending",
+	[UWB_RSV_STATE_NONE]                 = "none            ",
+	[UWB_RSV_STATE_O_INITIATED]          = "o initiated     ",
+	[UWB_RSV_STATE_O_PENDING]            = "o pending       ",
+	[UWB_RSV_STATE_O_MODIFIED]           = "o modified      ",
+	[UWB_RSV_STATE_O_ESTABLISHED]        = "o established   ",
+	[UWB_RSV_STATE_O_TO_BE_MOVED]        = "o to be moved   ",
+	[UWB_RSV_STATE_O_MOVE_EXPANDING]     = "o move expanding",
+	[UWB_RSV_STATE_O_MOVE_COMBINING]     = "o move combining",
+	[UWB_RSV_STATE_O_MOVE_REDUCING]      = "o move reducing ",
+	[UWB_RSV_STATE_T_ACCEPTED]           = "t accepted      ",
+	[UWB_RSV_STATE_T_CONFLICT]           = "t conflict      ",
+	[UWB_RSV_STATE_T_PENDING]            = "t pending       ",
+	[UWB_RSV_STATE_T_DENIED]             = "t denied        ",
+	[UWB_RSV_STATE_T_RESIZED]            = "t resized       ",
+	[UWB_RSV_STATE_T_EXPANDING_ACCEPTED] = "t expanding acc ",
+	[UWB_RSV_STATE_T_EXPANDING_CONFLICT] = "t expanding conf",
+	[UWB_RSV_STATE_T_EXPANDING_PENDING]  = "t expanding pend",
+	[UWB_RSV_STATE_T_EXPANDING_DENIED]   = "t expanding den ",
 };
 
 static const char *rsv_types[] = {
@@ -41,6 +52,31 @@ static const char *rsv_types[] = {
 	[UWB_DRP_TYPE_PCA]      = "pca",
 };
 
+bool uwb_rsv_has_two_drp_ies(struct uwb_rsv *rsv)
+{
+	static const bool has_two_drp_ies[] = {
+		[UWB_RSV_STATE_O_INITIATED]               = false,
+		[UWB_RSV_STATE_O_PENDING]                 = false,
+		[UWB_RSV_STATE_O_MODIFIED]                = false,
+		[UWB_RSV_STATE_O_ESTABLISHED]             = false,
+		[UWB_RSV_STATE_O_TO_BE_MOVED]             = false,
+		[UWB_RSV_STATE_O_MOVE_COMBINING]          = false,
+		[UWB_RSV_STATE_O_MOVE_REDUCING]           = false,
+		[UWB_RSV_STATE_O_MOVE_EXPANDING]          = true,
+		[UWB_RSV_STATE_T_ACCEPTED]                = false,
+		[UWB_RSV_STATE_T_CONFLICT]                = false,
+		[UWB_RSV_STATE_T_PENDING]                 = false,
+		[UWB_RSV_STATE_T_DENIED]                  = false,
+		[UWB_RSV_STATE_T_RESIZED]                 = false,
+		[UWB_RSV_STATE_T_EXPANDING_ACCEPTED]      = true,
+		[UWB_RSV_STATE_T_EXPANDING_CONFLICT]      = true,
+		[UWB_RSV_STATE_T_EXPANDING_PENDING]       = true,
+		[UWB_RSV_STATE_T_EXPANDING_DENIED]        = true,
+	};
+
+	return has_two_drp_ies[rsv->state];
+}
+
 /**
  * uwb_rsv_state_str - return a string for a reservation state
  * @state: the reservation state.
@@ -65,7 +101,7 @@ const char *uwb_rsv_type_str(enum uwb_drp_type type)
 }
 EXPORT_SYMBOL_GPL(uwb_rsv_type_str);
 
-static void uwb_rsv_dump(struct uwb_rsv *rsv)
+void uwb_rsv_dump(char *text, struct uwb_rsv *rsv)
 {
 	struct device *dev = &rsv->rc->uwb_dev.dev;
 	struct uwb_dev_addr devaddr;
@@ -88,12 +124,12 @@ static void uwb_rsv_release(struct kref *kref)
 	kfree(rsv);
 }
 
-static void uwb_rsv_get(struct uwb_rsv *rsv)
+void uwb_rsv_get(struct uwb_rsv *rsv)
 {
 	kref_get(&rsv->kref);
 }
 
-static void uwb_rsv_put(struct uwb_rsv *rsv)
+void uwb_rsv_put(struct uwb_rsv *rsv)
 {
 	kref_put(&rsv->kref, uwb_rsv_release);
 }
@@ -108,6 +144,7 @@ static void uwb_rsv_put(struct uwb_rsv *rsv)
 static int uwb_rsv_get_stream(struct uwb_rsv *rsv)
 {
 	struct uwb_rc *rc = rsv->rc;
+	struct device *dev = &rc->uwb_dev.dev;
 	unsigned long *streams_bm;
 	int stream;
 
@@ -129,12 +166,15 @@ static int uwb_rsv_get_stream(struct uwb_rsv *rsv)
 	rsv->stream = stream;
 	set_bit(stream, streams_bm);
 
+	dev_dbg(dev, "get stream %d\n", rsv->stream);
+
 	return 0;
 }
 
 static void uwb_rsv_put_stream(struct uwb_rsv *rsv)
 {
 	struct uwb_rc *rc = rsv->rc;
+	struct device *dev = &rc->uwb_dev.dev;
 	unsigned long *streams_bm;
 
 	switch (rsv->target.type) {
@@ -149,86 +189,52 @@ static void uwb_rsv_put_stream(struct uwb_rsv *rsv)
 	}
 
 	clear_bit(rsv->stream, streams_bm);
+
+	dev_dbg(dev, "put stream %d\n", rsv->stream);
 }
 
-/*
- * Generate a MAS allocation with a single row component.
- */
-static void uwb_rsv_gen_alloc_row(struct uwb_mas_bm *mas,
-				  int first_mas, int mas_per_zone,
-				  int zs, int ze)
+void uwb_rsv_backoff_win_timer(unsigned long arg)
 {
-	struct uwb_mas_bm col;
-	int z;
+	struct uwb_drp_backoff_win *bow = (struct uwb_drp_backoff_win *)arg;
+	struct uwb_rc *rc = container_of(bow, struct uwb_rc, bow);
+	struct device *dev = &rc->uwb_dev.dev;
 
-	bitmap_zero(mas->bm, UWB_NUM_MAS);
-	bitmap_zero(col.bm, UWB_NUM_MAS);
-	bitmap_fill(col.bm, mas_per_zone);
-	bitmap_shift_left(col.bm, col.bm, first_mas + zs * UWB_MAS_PER_ZONE, UWB_NUM_MAS);
-
-	for (z = zs; z <= ze; z++) {
-		bitmap_or(mas->bm, mas->bm, col.bm, UWB_NUM_MAS);
-		bitmap_shift_left(col.bm, col.bm, UWB_MAS_PER_ZONE, UWB_NUM_MAS);
+	bow->can_reserve_extra_mases = true;
+	if (bow->total_expired <= 4) {
+		bow->total_expired++;
+	} else {
+		/* after 4 backoff window has expired we can exit from
+		 * the backoff procedure */
+		bow->total_expired = 0;
+		bow->window = UWB_DRP_BACKOFF_WIN_MIN >> 1;
 	}
+	dev_dbg(dev, "backoff_win_timer total_expired=%d, n=%d\n: ", bow->total_expired, bow->n);
+
+	/* try to relocate all the "to be moved" relocations */
+	uwb_rsv_handle_drp_avail_change(rc);
 }
 
-/*
- * Allocate some MAS for this reservation based on current local
- * availability, the reservation parameters (max_mas, min_mas,
- * sparsity), and the WiMedia rules for MAS allocations.
- *
- * Returns -EBUSY is insufficient free MAS are available.
- *
- * FIXME: to simplify this, only safe reservations with a single row
- * component in zones 1 to 15 are tried (zone 0 is skipped to avoid
- * problems with the MAS reserved for the BP).
- *
- * [ECMA-368] section B.2.
- */
-static int uwb_rsv_alloc_mas(struct uwb_rsv *rsv)
+void uwb_rsv_backoff_win_increment(struct uwb_rc *rc)
 {
-	static const int safe_mas_in_row[UWB_NUM_ZONES] = {
-		8, 7, 6, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 2, 1,
-	};
-	int n, r;
-	struct uwb_mas_bm mas;
-	bool found = false;
+	struct uwb_drp_backoff_win *bow = &rc->bow;
+	struct device *dev = &rc->uwb_dev.dev;
+	unsigned timeout_us;
 
-	/*
-	 * Search all valid safe allocations until either: too few MAS
-	 * are available; or the smallest allocation with sufficient
-	 * MAS is found.
-	 *
-	 * The top of the zones are preferred, so space for larger
-	 * allocations is available in the bottom of the zone (e.g., a
-	 * 15 MAS allocation should start in row 14 leaving space for
-	 * a 120 MAS allocation at row 0).
-	 */
-	for (n = safe_mas_in_row[0]; n >= 1; n--) {
-		int num_mas;
+	dev_dbg(dev, "backoff_win_increment: window=%d\n", bow->window);
 
-		num_mas = n * (UWB_NUM_ZONES - 1);
-		if (num_mas < rsv->min_mas)
-			break;
-		if (found && num_mas < rsv->max_mas)
-			break;
+	bow->can_reserve_extra_mases = false;
 
-		for (r = UWB_MAS_PER_ZONE-1;  r >= 0; r--) {
-			if (safe_mas_in_row[r] < n)
-				continue;
-			uwb_rsv_gen_alloc_row(&mas, r, n, 1, UWB_NUM_ZONES);
-			if (uwb_drp_avail_reserve_pending(rsv->rc, &mas) == 0) {
-				found = true;
-				break;
-			}
-		}
-	}
+	if((bow->window << 1) == UWB_DRP_BACKOFF_WIN_MAX)
+		return;
 
-	if (!found)
-		return -EBUSY;
+	bow->window <<= 1;
+	bow->n = random32() & (bow->window - 1);
+	dev_dbg(dev, "new_window=%d, n=%d\n: ", bow->window, bow->n);
 
-	bitmap_copy(rsv->mas.bm, mas.bm, UWB_NUM_MAS);
-	return 0;
+	/* reset the timer associated variables */
+	timeout_us = bow->n * UWB_SUPERFRAME_LENGTH_US;
+	bow->total_expired = 0;
+	mod_timer(&bow->timer, jiffies + usecs_to_jiffies(timeout_us));		
 }
 
 static void uwb_rsv_stroke_timer(struct uwb_rsv *rsv)
@@ -241,13 +247,16 @@ static void uwb_rsv_stroke_timer(struct uwb_rsv *rsv)
 	 * received.
 	 */
 	if (rsv->is_multicast) {
-		if (rsv->state == UWB_RSV_STATE_O_INITIATED)
+		if (rsv->state == UWB_RSV_STATE_O_INITIATED
+		    || rsv->state == UWB_RSV_STATE_O_MOVE_EXPANDING
+		    || rsv->state == UWB_RSV_STATE_O_MOVE_COMBINING
+		    || rsv->state == UWB_RSV_STATE_O_MOVE_REDUCING)
 			sframes = 1;
 		if (rsv->state == UWB_RSV_STATE_O_ESTABLISHED)
 			sframes = 0;
+		
 	}
 
-	rsv->expired = false;
 	if (sframes > 0) {
 		/*
 		 * Add an additional 2 superframes to account for the
@@ -269,7 +278,7 @@ static void uwb_rsv_state_update(struct uwb_rsv *rsv,
 	rsv->state = new_state;
 	rsv->ie_valid = false;
 
-	uwb_rsv_dump(rsv);
+	uwb_rsv_dump("SU", rsv);
 
 	uwb_rsv_stroke_timer(rsv);
 	uwb_rsv_sched_update(rsv->rc);
@@ -283,10 +292,17 @@ static void uwb_rsv_callback(struct uwb_rsv *rsv)
 
 void uwb_rsv_set_state(struct uwb_rsv *rsv, enum uwb_rsv_state new_state)
 {
+	struct uwb_rsv_move *mv = &rsv->mv;
+
 	if (rsv->state == new_state) {
 		switch (rsv->state) {
 		case UWB_RSV_STATE_O_ESTABLISHED:
+		case UWB_RSV_STATE_O_MOVE_EXPANDING:
+		case UWB_RSV_STATE_O_MOVE_COMBINING:
+		case UWB_RSV_STATE_O_MOVE_REDUCING:
 		case UWB_RSV_STATE_T_ACCEPTED:
+		case UWB_RSV_STATE_T_EXPANDING_ACCEPTED:
+		case UWB_RSV_STATE_T_RESIZED:
 		case UWB_RSV_STATE_NONE:
 			uwb_rsv_stroke_timer(rsv);
 			break;
@@ -298,11 +314,10 @@ void uwb_rsv_set_state(struct uwb_rsv *rsv, enum uwb_rsv_state new_state)
 		return;
 	}
 
+	uwb_rsv_dump("SC", rsv);
+
 	switch (new_state) {
 	case UWB_RSV_STATE_NONE:
-		uwb_drp_avail_release(rsv->rc, &rsv->mas);
-		if (uwb_rsv_is_owner(rsv))
-			uwb_rsv_put_stream(rsv);
 		uwb_rsv_state_update(rsv, UWB_RSV_STATE_NONE);
 		uwb_rsv_callback(rsv);
 		break;
@@ -312,12 +327,45 @@ void uwb_rsv_set_state(struct uwb_rsv *rsv, enum uwb_rsv_state new_state)
 	case UWB_RSV_STATE_O_PENDING:
 		uwb_rsv_state_update(rsv, UWB_RSV_STATE_O_PENDING);
 		break;
+	case UWB_RSV_STATE_O_MODIFIED:
+		/* in the companion there are the MASes to drop */
+		bitmap_andnot(rsv->mas.bm, rsv->mas.bm, mv->companion_mas.bm, UWB_NUM_MAS);
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_O_MODIFIED);
+		break;
 	case UWB_RSV_STATE_O_ESTABLISHED:
+		if (rsv->state == UWB_RSV_STATE_O_MODIFIED
+		    || rsv->state == UWB_RSV_STATE_O_MOVE_REDUCING) {
+			uwb_drp_avail_release(rsv->rc, &mv->companion_mas);
+			rsv->needs_release_companion_mas = false;
+		}
 		uwb_drp_avail_reserve(rsv->rc, &rsv->mas);
 		uwb_rsv_state_update(rsv, UWB_RSV_STATE_O_ESTABLISHED);
 		uwb_rsv_callback(rsv);
 		break;
+	case UWB_RSV_STATE_O_MOVE_EXPANDING:
+		rsv->needs_release_companion_mas = true;
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_O_MOVE_EXPANDING);
+		break;
+	case UWB_RSV_STATE_O_MOVE_COMBINING:
+		rsv->needs_release_companion_mas = false;
+		uwb_drp_avail_reserve(rsv->rc, &mv->companion_mas);
+		bitmap_or(rsv->mas.bm, rsv->mas.bm, mv->companion_mas.bm, UWB_NUM_MAS);
+		rsv->mas.safe   += mv->companion_mas.safe;
+		rsv->mas.unsafe += mv->companion_mas.unsafe;
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_O_MOVE_COMBINING);
+		break;
+	case UWB_RSV_STATE_O_MOVE_REDUCING:
+		bitmap_andnot(mv->companion_mas.bm, rsv->mas.bm, mv->final_mas.bm, UWB_NUM_MAS);
+		rsv->needs_release_companion_mas = true;
+		rsv->mas.safe   = mv->final_mas.safe;
+		rsv->mas.unsafe = mv->final_mas.unsafe;
+		bitmap_copy(rsv->mas.bm, mv->final_mas.bm, UWB_NUM_MAS);
+		bitmap_copy(rsv->mas.unsafe_bm, mv->final_mas.unsafe_bm, UWB_NUM_MAS);
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_O_MOVE_REDUCING);
+		break;
 	case UWB_RSV_STATE_T_ACCEPTED:
+	case UWB_RSV_STATE_T_RESIZED:
+		rsv->needs_release_companion_mas = false;
 		uwb_drp_avail_reserve(rsv->rc, &rsv->mas);
 		uwb_rsv_state_update(rsv, UWB_RSV_STATE_T_ACCEPTED);
 		uwb_rsv_callback(rsv);
@@ -325,12 +373,82 @@ void uwb_rsv_set_state(struct uwb_rsv *rsv, enum uwb_rsv_state new_state)
 	case UWB_RSV_STATE_T_DENIED:
 		uwb_rsv_state_update(rsv, UWB_RSV_STATE_T_DENIED);
 		break;
+	case UWB_RSV_STATE_T_CONFLICT:
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_T_CONFLICT);
+		break;
+	case UWB_RSV_STATE_T_PENDING:
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_T_PENDING);
+		break;
+	case UWB_RSV_STATE_T_EXPANDING_ACCEPTED:
+		rsv->needs_release_companion_mas = true;
+		uwb_drp_avail_reserve(rsv->rc, &mv->companion_mas);
+		uwb_rsv_state_update(rsv, UWB_RSV_STATE_T_EXPANDING_ACCEPTED);
+		break;
 	default:
 		dev_err(&rsv->rc->uwb_dev.dev, "unhandled state: %s (%d)\n",
 			uwb_rsv_state_str(new_state), new_state);
 	}
 }
 
+static void uwb_rsv_handle_timeout_work(struct work_struct *work)
+{
+	struct uwb_rsv *rsv = container_of(work, struct uwb_rsv,
+					   handle_timeout_work);
+	struct uwb_rc *rc = rsv->rc;
+
+	mutex_lock(&rc->rsvs_mutex);
+
+	uwb_rsv_dump("TO", rsv);
+
+	switch (rsv->state) {
+	case UWB_RSV_STATE_O_INITIATED:
+		if (rsv->is_multicast) {
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
+			goto unlock;
+		}
+		break;
+	case UWB_RSV_STATE_O_MOVE_EXPANDING:
+		if (rsv->is_multicast) {
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_COMBINING);
+			goto unlock;
+		}
+		break;
+	case UWB_RSV_STATE_O_MOVE_COMBINING:
+		if (rsv->is_multicast) {
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_REDUCING);
+			goto unlock;
+		}
+		break;
+	case UWB_RSV_STATE_O_MOVE_REDUCING:
+		if (rsv->is_multicast) {
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_ESTABLISHED);
+			goto unlock;
+		}
+		break;
+	case UWB_RSV_STATE_O_ESTABLISHED:
+		if (rsv->is_multicast)
+			goto unlock;
+		break;
+	case UWB_RSV_STATE_T_EXPANDING_ACCEPTED:
+		/*
+		 * The time out could be for the main or of the
+		 * companion DRP, assume it's for the companion and
+		 * drop that first.  A further time out is required to
+		 * drop the main.
+		 */
+		uwb_rsv_set_state(rsv, UWB_RSV_STATE_T_ACCEPTED);
+		uwb_drp_avail_release(rsv->rc, &rsv->mv.companion_mas);
+		goto unlock;
+	default:
+		break;
+	}
+
+	uwb_rsv_remove(rsv);
+
+unlock:
+	mutex_unlock(&rc->rsvs_mutex);
+}
+
 static struct uwb_rsv *uwb_rsv_alloc(struct uwb_rc *rc)
 {
 	struct uwb_rsv *rsv;
@@ -347,6 +465,7 @@ static struct uwb_rsv *uwb_rsv_alloc(struct uwb_rc *rc)
 	rsv->timer.data     = (unsigned long)rsv;
 
 	rsv->rc = rc;
+	INIT_WORK(&rsv->handle_timeout_work, uwb_rsv_handle_timeout_work);
 
 	return rsv;
 }
@@ -381,8 +500,18 @@ EXPORT_SYMBOL_GPL(uwb_rsv_create);
 
 void uwb_rsv_remove(struct uwb_rsv *rsv)
 {
+	uwb_rsv_dump("RM", rsv);
+
 	if (rsv->state != UWB_RSV_STATE_NONE)
 		uwb_rsv_set_state(rsv, UWB_RSV_STATE_NONE);
+
+	if (rsv->needs_release_companion_mas)
+		uwb_drp_avail_release(rsv->rc, &rsv->mv.companion_mas);
+	uwb_drp_avail_release(rsv->rc, &rsv->mas);
+
+	if (uwb_rsv_is_owner(rsv))
+		uwb_rsv_put_stream(rsv);
+	
 	del_timer_sync(&rsv->timer);
 	uwb_dev_put(rsv->owner);
 	if (rsv->target.type == UWB_RSV_TARGET_DEV)
@@ -409,7 +538,7 @@ EXPORT_SYMBOL_GPL(uwb_rsv_destroy);
  * @rsv: the reservation
  *
  * The PAL should fill in @rsv's owner, target, type, max_mas,
- * min_mas, sparsity and is_multicast fields.  If the target is a
+ * min_mas, max_interval and is_multicast fields.  If the target is a
  * uwb_dev it must be referenced.
  *
  * The reservation's callback will be called when the reservation is
@@ -418,16 +547,27 @@ EXPORT_SYMBOL_GPL(uwb_rsv_destroy);
 int uwb_rsv_establish(struct uwb_rsv *rsv)
 {
 	struct uwb_rc *rc = rsv->rc;
+	struct uwb_mas_bm available;
 	int ret;
 
 	mutex_lock(&rc->rsvs_mutex);
-
 	ret = uwb_rsv_get_stream(rsv);
 	if (ret)
 		goto out;
 
-	ret = uwb_rsv_alloc_mas(rsv);
-	if (ret) {
+	rsv->tiebreaker = random32() & 1;
+	/* get available mas bitmap */
+	uwb_drp_available(rc, &available);
+
+	ret = uwb_rsv_find_best_allocation(rsv, &available, &rsv->mas);
+	if (ret == UWB_RSV_ALLOC_NOT_FOUND) {
+		ret = -EBUSY;
+		uwb_rsv_put_stream(rsv);
+		goto out;
+	}
+
+	ret = uwb_drp_avail_reserve_pending(rc, &rsv->mas);
+	if (ret != 0) {
 		uwb_rsv_put_stream(rsv);
 		goto out;
 	}
@@ -448,16 +588,71 @@ EXPORT_SYMBOL_GPL(uwb_rsv_establish);
  * @rsv: the reservation to modify
  * @max_mas: new maximum MAS to reserve
  * @min_mas: new minimum MAS to reserve
- * @sparsity: new sparsity to use
+ * @max_interval: new max_interval to use
  *
  * FIXME: implement this once there are PALs that use it.
  */
-int uwb_rsv_modify(struct uwb_rsv *rsv, int max_mas, int min_mas, int sparsity)
+int uwb_rsv_modify(struct uwb_rsv *rsv, int max_mas, int min_mas, int max_interval)
 {
 	return -ENOSYS;
 }
 EXPORT_SYMBOL_GPL(uwb_rsv_modify);
 
+/*
+ * move an already established reservation (rc->rsvs_mutex must to be
+ * taken when tis function is called)
+ */
+int uwb_rsv_try_move(struct uwb_rsv *rsv, struct uwb_mas_bm *available)
+{
+	struct uwb_rc *rc = rsv->rc;
+	struct uwb_drp_backoff_win *bow = &rc->bow;
+	struct device *dev = &rc->uwb_dev.dev;
+	struct uwb_rsv_move *mv;
+	int ret = 0;
+ 
+	if (bow->can_reserve_extra_mases == false)
+		return -EBUSY;
+
+	mv = &rsv->mv;
+
+	if (uwb_rsv_find_best_allocation(rsv, available, &mv->final_mas) == UWB_RSV_ALLOC_FOUND) {
+
+		if (!bitmap_equal(rsv->mas.bm, mv->final_mas.bm, UWB_NUM_MAS)) {
+			/* We want to move the reservation */
+			bitmap_andnot(mv->companion_mas.bm, mv->final_mas.bm, rsv->mas.bm, UWB_NUM_MAS);
+			uwb_drp_avail_reserve_pending(rc, &mv->companion_mas);
+			uwb_rsv_set_state(rsv, UWB_RSV_STATE_O_MOVE_EXPANDING);
+		}
+	} else {
+		dev_dbg(dev, "new allocation not found\n");
+	}
+	
+	return ret;
+}
+
+/* It will try to move every reservation in state O_ESTABLISHED giving
+ * to the MAS allocator algorithm an availability that is the real one
+ * plus the allocation already established from the reservation. */
+void uwb_rsv_handle_drp_avail_change(struct uwb_rc *rc)
+{
+	struct uwb_drp_backoff_win *bow = &rc->bow;
+	struct uwb_rsv *rsv;
+	struct uwb_mas_bm mas;
+	
+	if (bow->can_reserve_extra_mases == false)
+		return;
+
+	list_for_each_entry(rsv, &rc->reservations, rc_node) {
+		if (rsv->state == UWB_RSV_STATE_O_ESTABLISHED ||
+		    rsv->state == UWB_RSV_STATE_O_TO_BE_MOVED) {
+			uwb_drp_available(rc, &mas);
+			bitmap_or(mas.bm, mas.bm, rsv->mas.bm, UWB_NUM_MAS);
+			uwb_rsv_try_move(rsv, &mas);
+		}
+	}
+	
+}
+
 /**
  * uwb_rsv_terminate - terminate an established reservation
  * @rsv: the reservation to terminate
@@ -546,6 +741,7 @@ static struct uwb_rsv *uwb_rsv_new_target(struct uwb_rc *rc,
 	uwb_dev_get(rsv->owner);
 	rsv->target.type = UWB_RSV_TARGET_DEV;
 	rsv->target.dev  = &rc->uwb_dev;
+	uwb_dev_get(&rc->uwb_dev);
 	rsv->type        = uwb_ie_drp_type(drp_ie);
 	rsv->stream      = uwb_ie_drp_stream_index(drp_ie);
 	uwb_drp_ie_to_bm(&rsv->mas, drp_ie);
@@ -567,11 +763,33 @@ static struct uwb_rsv *uwb_rsv_new_target(struct uwb_rc *rc,
 	list_add_tail(&rsv->rc_node, &rc->reservations);
 	state = rsv->state;
 	rsv->state = UWB_RSV_STATE_NONE;
-	uwb_rsv_set_state(rsv, state);
+
+	/* FIXME: do something sensible here */
+	if (state == UWB_RSV_STATE_T_ACCEPTED
+	    && uwb_drp_avail_reserve_pending(rc, &rsv->mas) == -EBUSY) {
+		/* FIXME: do something sensible here */
+	} else {
+		uwb_rsv_set_state(rsv, state);
+	}
 
 	return rsv;
 }
 
+/**
+ * uwb_rsv_get_usable_mas - get the bitmap of the usable MAS of a reservations
+ * @rsv: the reservation.
+ * @mas: returns the available MAS.
+ *
+ * The usable MAS of a reservation may be less than the negotiated MAS
+ * if alien BPs are present.
+ */
+void uwb_rsv_get_usable_mas(struct uwb_rsv *rsv, struct uwb_mas_bm *mas)
+{
+	bitmap_zero(mas->bm, UWB_NUM_MAS);
+	bitmap_andnot(mas->bm, rsv->mas.bm, rsv->rc->cnflt_alien_bitmap.bm, UWB_NUM_MAS);
+}
+EXPORT_SYMBOL_GPL(uwb_rsv_get_usable_mas);
+
 /**
  * uwb_rsv_find - find a reservation for a received DRP IE.
  * @rc: the radio controller
@@ -611,8 +829,6 @@ static bool uwb_rsv_update_all(struct uwb_rc *rc)
 	bool ie_updated = false;
 
 	list_for_each_entry_safe(rsv, t, &rc->reservations, rc_node) {
-		if (rsv->expired)
-			uwb_drp_handle_timeout(rsv);
 		if (!rsv->ie_valid) {
 			uwb_drp_ie_update(rsv);
 			ie_updated = true;
@@ -622,9 +838,47 @@ static bool uwb_rsv_update_all(struct uwb_rc *rc)
 	return ie_updated;
 }
 
+void uwb_rsv_queue_update(struct uwb_rc *rc)
+{
+	unsigned long delay_us = UWB_MAS_LENGTH_US * UWB_MAS_PER_ZONE;
+
+	queue_delayed_work(rc->rsv_workq, &rc->rsv_update_work, usecs_to_jiffies(delay_us));
+}
+
+/**
+ * uwb_rsv_sched_update - schedule an update of the DRP IEs
+ * @rc: the radio controller.
+ *
+ * To improve performance and ensure correctness with [ECMA-368] the
+ * number of SET-DRP-IE commands that are done are limited.
+ *
+ * DRP IEs update come from two sources: DRP events from the hardware
+ * which all occur at the beginning of the superframe ('syncronous'
+ * events) and reservation establishment/termination requests from
+ * PALs or timers ('asynchronous' events).
+ *
+ * A delayed work ensures that all the synchronous events result in
+ * one SET-DRP-IE command.
+ *
+ * Additional logic (the set_drp_ie_pending and rsv_updated_postponed
+ * flags) will prevent an asynchrous event starting a SET-DRP-IE
+ * command if one is currently awaiting a response.
+ *
+ * FIXME: this does leave a window where an asynchrous event can delay
+ * the SET-DRP-IE for a synchronous event by one superframe.
+ */
 void uwb_rsv_sched_update(struct uwb_rc *rc)
 {
-	queue_work(rc->rsv_workq, &rc->rsv_update_work);
+	spin_lock(&rc->rsvs_lock);
+	if (!delayed_work_pending(&rc->rsv_update_work)) {
+		if (rc->set_drp_ie_pending > 0) {
+			rc->set_drp_ie_pending++;
+			goto unlock;
+		}
+		uwb_rsv_queue_update(rc);
+	}
+unlock:
+	spin_unlock(&rc->rsvs_lock);
 }
 
 /*
@@ -633,7 +887,8 @@ void uwb_rsv_sched_update(struct uwb_rc *rc)
  */
 static void uwb_rsv_update_work(struct work_struct *work)
 {
-	struct uwb_rc *rc = container_of(work, struct uwb_rc, rsv_update_work);
+	struct uwb_rc *rc = container_of(work, struct uwb_rc,
+					 rsv_update_work.work);
 	bool ie_updated;
 
 	mutex_lock(&rc->rsvs_mutex);
@@ -645,18 +900,34 @@ static void uwb_rsv_update_work(struct work_struct *work)
 		ie_updated = true;
 	}
 
-	if (ie_updated)
+	if (ie_updated && (rc->set_drp_ie_pending == 0))
 		uwb_rc_send_all_drp_ie(rc);
 
 	mutex_unlock(&rc->rsvs_mutex);
 }
 
+static void uwb_rsv_alien_bp_work(struct work_struct *work)
+{
+	struct uwb_rc *rc = container_of(work, struct uwb_rc,
+					 rsv_alien_bp_work.work);
+	struct uwb_rsv *rsv;
+
+	mutex_lock(&rc->rsvs_mutex);
+	
+	list_for_each_entry(rsv, &rc->reservations, rc_node) {
+		if (rsv->type != UWB_DRP_TYPE_ALIEN_BP) {
+			rsv->callback(rsv);
+		}
+	}
+
+	mutex_unlock(&rc->rsvs_mutex);
+}
+
 static void uwb_rsv_timer(unsigned long arg)
 {
 	struct uwb_rsv *rsv = (struct uwb_rsv *)arg;
 
-	rsv->expired = true;
-	uwb_rsv_sched_update(rsv->rc);
+	queue_work(rsv->rc->rsv_workq, &rsv->handle_timeout_work);
 }
 
 /**
@@ -673,16 +944,27 @@ void uwb_rsv_remove_all(struct uwb_rc *rc)
 	list_for_each_entry_safe(rsv, t, &rc->reservations, rc_node) {
 		uwb_rsv_remove(rsv);
 	}
+	/* Cancel any postponed update. */
+	rc->set_drp_ie_pending = 0;
 	mutex_unlock(&rc->rsvs_mutex);
 
-	cancel_work_sync(&rc->rsv_update_work);
+	cancel_delayed_work_sync(&rc->rsv_update_work);
 }
 
 void uwb_rsv_init(struct uwb_rc *rc)
 {
 	INIT_LIST_HEAD(&rc->reservations);
+	INIT_LIST_HEAD(&rc->cnflt_alien_list);
 	mutex_init(&rc->rsvs_mutex);
-	INIT_WORK(&rc->rsv_update_work, uwb_rsv_update_work);
+	spin_lock_init(&rc->rsvs_lock);
+	INIT_DELAYED_WORK(&rc->rsv_update_work, uwb_rsv_update_work);
+	INIT_DELAYED_WORK(&rc->rsv_alien_bp_work, uwb_rsv_alien_bp_work);
+	rc->bow.can_reserve_extra_mases = true;
+	rc->bow.total_expired = 0;
+	rc->bow.window = UWB_DRP_BACKOFF_WIN_MIN >> 1;
+	init_timer(&rc->bow.timer);
+	rc->bow.timer.function = uwb_rsv_backoff_win_timer;
+	rc->bow.timer.data     = (unsigned long)&rc->bow;
 
 	bitmap_complement(rc->uwb_dev.streams, rc->uwb_dev.streams, UWB_NUM_STREAMS);
 }
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index a6debb9baf3..89b2e6a7214 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -82,29 +82,21 @@ struct uwb_dbg {
 	struct dentry *reservations_f;
 	struct dentry *accept_f;
 	struct dentry *drp_avail_f;
+	spinlock_t list_lock;
 };
 
 static struct dentry *root_dir;
 
 static void uwb_dbg_rsv_cb(struct uwb_rsv *rsv)
 {
-	struct uwb_rc *rc = rsv->rc;
-	struct device *dev = &rc->uwb_dev.dev;
-	struct uwb_dev_addr devaddr;
-	char owner[UWB_ADDR_STRSIZE], target[UWB_ADDR_STRSIZE];
-
-	uwb_dev_addr_print(owner, sizeof(owner), &rsv->owner->dev_addr);
-	if (rsv->target.type == UWB_RSV_TARGET_DEV)
-		devaddr = rsv->target.dev->dev_addr;
-	else
-		devaddr = rsv->target.devaddr;
-	uwb_dev_addr_print(target, sizeof(target), &devaddr);
+	struct uwb_dbg *dbg = rsv->pal_priv;
 
-	dev_dbg(dev, "debug: rsv %s -> %s: %s\n",
-		owner, target, uwb_rsv_state_str(rsv->state));
+	uwb_rsv_dump("debug", rsv);
 
 	if (rsv->state == UWB_RSV_STATE_NONE) {
+		spin_lock(&dbg->list_lock);
 		list_del(&rsv->pal_node);
+		spin_unlock(&dbg->list_lock);
 		uwb_rsv_destroy(rsv);
 	}
 }
@@ -128,20 +120,21 @@ static int cmd_rsv_establish(struct uwb_rc *rc,
 		return -ENOMEM;
 	}
 
-	rsv->owner       = &rc->uwb_dev;
-	rsv->target.type = UWB_RSV_TARGET_DEV;
-	rsv->target.dev  = target;
-	rsv->type        = cmd->type;
-	rsv->max_mas     = cmd->max_mas;
-	rsv->min_mas     = cmd->min_mas;
-	rsv->sparsity    = cmd->sparsity;
+	rsv->target.type  = UWB_RSV_TARGET_DEV;
+	rsv->target.dev   = target;
+	rsv->type         = cmd->type;
+	rsv->max_mas      = cmd->max_mas;
+	rsv->min_mas      = cmd->min_mas;
+	rsv->max_interval = cmd->max_interval;
 
 	ret = uwb_rsv_establish(rsv);
 	if (ret)
 		uwb_rsv_destroy(rsv);
-	else
+	else {
+		spin_lock(&(rc->dbg)->list_lock);
 		list_add_tail(&rsv->pal_node, &rc->dbg->rsvs);
-
+		spin_unlock(&(rc->dbg)->list_lock);
+	}
 	return ret;
 }
 
@@ -151,17 +144,24 @@ static int cmd_rsv_terminate(struct uwb_rc *rc,
 	struct uwb_rsv *rsv, *found = NULL;
 	int i = 0;
 
+	spin_lock(&(rc->dbg)->list_lock);
+
 	list_for_each_entry(rsv, &rc->dbg->rsvs, pal_node) {
 		if (i == cmd->index) {
 			found = rsv;
+			uwb_rsv_get(found);
 			break;
 		}
 		i++;
 	}
+
+	spin_unlock(&(rc->dbg)->list_lock);
+
 	if (!found)
 		return -EINVAL;
 
 	uwb_rsv_terminate(found);
+	uwb_rsv_put(found);
 
 	return 0;
 }
@@ -191,7 +191,7 @@ static ssize_t command_write(struct file *file, const char __user *buf,
 	struct uwb_rc *rc = file->private_data;
 	struct uwb_dbg_cmd cmd;
 	int ret = 0;
-
+	
 	if (len != sizeof(struct uwb_dbg_cmd))
 		return -EINVAL;
 
@@ -325,7 +325,9 @@ static void uwb_dbg_new_rsv(struct uwb_pal *pal, struct uwb_rsv *rsv)
 	struct uwb_dbg *dbg = container_of(pal, struct uwb_dbg, pal);
 
 	if (dbg->accept) {
+		spin_lock(&dbg->list_lock);
 		list_add_tail(&rsv->pal_node, &dbg->rsvs);
+		spin_unlock(&dbg->list_lock);
 		uwb_rsv_accept(rsv, uwb_dbg_rsv_cb, dbg);
 	}
 }
@@ -341,6 +343,7 @@ void uwb_dbg_add_rc(struct uwb_rc *rc)
 		return;
 
 	INIT_LIST_HEAD(&rc->dbg->rsvs);
+	spin_lock_init(&(rc->dbg)->list_lock);
 
 	uwb_pal_init(&rc->dbg->pal);
 	rc->dbg->pal.rc = rc;
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index f0f21f406bf..d5bcfc1c227 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -92,6 +92,12 @@ extern const char *uwb_rc_strerror(unsigned code);
 
 struct uwb_rc_neh;
 
+extern int uwb_rc_cmd_async(struct uwb_rc *rc, const char *cmd_name,
+			    struct uwb_rccb *cmd, size_t cmd_size,
+			    u8 expected_type, u16 expected_event,
+			    uwb_rc_cmd_cb_f cb, void *arg);
+
+
 void uwb_rc_neh_create(struct uwb_rc *rc);
 void uwb_rc_neh_destroy(struct uwb_rc *rc);
 
@@ -106,7 +112,69 @@ void uwb_rc_neh_put(struct uwb_rc_neh *neh);
 extern int uwb_est_create(void);
 extern void uwb_est_destroy(void);
 
+/*
+ * UWB conflicting alien reservations
+ */
+struct uwb_cnflt_alien {
+	struct uwb_rc *rc;
+	struct list_head rc_node;
+	struct uwb_mas_bm mas;
+	struct timer_list timer;
+	struct work_struct cnflt_update_work;
+};
+
+enum uwb_uwb_rsv_alloc_result {
+	UWB_RSV_ALLOC_FOUND = 0,
+	UWB_RSV_ALLOC_NOT_FOUND,
+};
+
+enum uwb_rsv_mas_status {
+	UWB_RSV_MAS_NOT_AVAIL = 1,
+	UWB_RSV_MAS_SAFE,
+	UWB_RSV_MAS_UNSAFE,
+};
+
+struct uwb_rsv_col_set_info {
+	unsigned char start_col;
+	unsigned char interval;
+	unsigned char safe_mas_per_col;
+	unsigned char unsafe_mas_per_col;
+};
+
+struct uwb_rsv_col_info {
+	unsigned char max_avail_safe;
+	unsigned char max_avail_unsafe;
+	unsigned char highest_mas[UWB_MAS_PER_ZONE];
+	struct uwb_rsv_col_set_info csi;
+};
+
+struct uwb_rsv_row_info {
+	unsigned char avail[UWB_MAS_PER_ZONE];
+	unsigned char free_rows;
+	unsigned char used_rows;
+};
+
+/*
+ * UWB find allocation
+ */
+struct uwb_rsv_alloc_info {
+	unsigned char bm[UWB_MAS_PER_ZONE * UWB_NUM_ZONES];
+	struct uwb_rsv_col_info ci[UWB_NUM_ZONES];
+	struct uwb_rsv_row_info ri;
+	struct uwb_mas_bm *not_available;
+	struct uwb_mas_bm *result;
+	int min_mas;
+	int max_mas;
+	int max_interval;
+	int total_allocated_mases;
+	int safe_allocated_mases;
+	int unsafe_allocated_mases;
+	int interval;
+};
 
+int uwb_rsv_find_best_allocation(struct uwb_rsv *rsv, struct uwb_mas_bm *available, 
+				 struct uwb_mas_bm *result);
+void uwb_rsv_handle_drp_avail_change(struct uwb_rc *rc);
 /*
  * UWB Events & management daemon
  */
@@ -254,18 +322,28 @@ void uwb_rsv_init(struct uwb_rc *rc);
 int uwb_rsv_setup(struct uwb_rc *rc);
 void uwb_rsv_cleanup(struct uwb_rc *rc);
 void uwb_rsv_remove_all(struct uwb_rc *rc);
+void uwb_rsv_get(struct uwb_rsv *rsv);
+void uwb_rsv_put(struct uwb_rsv *rsv);
+bool uwb_rsv_has_two_drp_ies(struct uwb_rsv *rsv);
+void uwb_rsv_dump(char *text, struct uwb_rsv *rsv);
+int uwb_rsv_try_move(struct uwb_rsv *rsv, struct uwb_mas_bm *available);
+void uwb_rsv_backoff_win_timer(unsigned long arg);
+void uwb_rsv_backoff_win_increment(struct uwb_rc *rc);
+int uwb_rsv_status(struct uwb_rsv *rsv);
+int uwb_rsv_companion_status(struct uwb_rsv *rsv);
 
 void uwb_rsv_set_state(struct uwb_rsv *rsv, enum uwb_rsv_state new_state);
 void uwb_rsv_remove(struct uwb_rsv *rsv);
 struct uwb_rsv *uwb_rsv_find(struct uwb_rc *rc, struct uwb_dev *src,
 			     struct uwb_ie_drp *drp_ie);
 void uwb_rsv_sched_update(struct uwb_rc *rc);
+void uwb_rsv_queue_update(struct uwb_rc *rc);
 
-void uwb_drp_handle_timeout(struct uwb_rsv *rsv);
 int uwb_drp_ie_update(struct uwb_rsv *rsv);
 void uwb_drp_ie_to_bm(struct uwb_mas_bm *bm, const struct uwb_ie_drp *drp_ie);
 
 void uwb_drp_avail_init(struct uwb_rc *rc);
+void uwb_drp_available(struct uwb_rc *rc, struct uwb_mas_bm *avail);
 int  uwb_drp_avail_reserve_pending(struct uwb_rc *rc, struct uwb_mas_bm *mas);
 void uwb_drp_avail_reserve(struct uwb_rc *rc, struct uwb_mas_bm *mas);
 void uwb_drp_avail_release(struct uwb_rc *rc, struct uwb_mas_bm *mas);
diff --git a/include/linux/uwb.h b/include/linux/uwb.h
index d7ed5201ade..c02128991ff 100644
--- a/include/linux/uwb.h
+++ b/include/linux/uwb.h
@@ -67,6 +67,7 @@ struct uwb_dev {
 	struct uwb_dev_addr dev_addr;
 	int beacon_slot;
 	DECLARE_BITMAP(streams, UWB_NUM_STREAMS);
+	DECLARE_BITMAP(last_availability_bm, UWB_NUM_MAS);
 };
 #define to_uwb_dev(d) container_of(d, struct uwb_dev, dev)
 
@@ -109,6 +110,9 @@ struct uwbd {
  */
 struct uwb_mas_bm {
 	DECLARE_BITMAP(bm, UWB_NUM_MAS);
+	DECLARE_BITMAP(unsafe_bm, UWB_NUM_MAS);
+	int safe;
+	int unsafe;
 };
 
 /**
@@ -134,14 +138,24 @@ struct uwb_mas_bm {
  * FIXME: further target states TBD.
  */
 enum uwb_rsv_state {
-	UWB_RSV_STATE_NONE,
+	UWB_RSV_STATE_NONE = 0,
 	UWB_RSV_STATE_O_INITIATED,
 	UWB_RSV_STATE_O_PENDING,
 	UWB_RSV_STATE_O_MODIFIED,
 	UWB_RSV_STATE_O_ESTABLISHED,
+	UWB_RSV_STATE_O_TO_BE_MOVED,
+	UWB_RSV_STATE_O_MOVE_EXPANDING,
+	UWB_RSV_STATE_O_MOVE_COMBINING,
+	UWB_RSV_STATE_O_MOVE_REDUCING,
 	UWB_RSV_STATE_T_ACCEPTED,
 	UWB_RSV_STATE_T_DENIED,
+	UWB_RSV_STATE_T_CONFLICT,
 	UWB_RSV_STATE_T_PENDING,
+	UWB_RSV_STATE_T_EXPANDING_ACCEPTED,
+	UWB_RSV_STATE_T_EXPANDING_CONFLICT,
+	UWB_RSV_STATE_T_EXPANDING_PENDING,
+	UWB_RSV_STATE_T_EXPANDING_DENIED,
+	UWB_RSV_STATE_T_RESIZED,
 
 	UWB_RSV_STATE_LAST,
 };
@@ -166,6 +180,12 @@ struct uwb_rsv_target {
 	};
 };
 
+struct uwb_rsv_move {
+	struct uwb_mas_bm final_mas;
+	struct uwb_ie_drp *companion_drp_ie;
+	struct uwb_mas_bm companion_mas;
+};
+
 /*
  * Number of streams reserved for reservations targeted at DevAddrs.
  */
@@ -203,6 +223,7 @@ typedef void (*uwb_rsv_cb_f)(struct uwb_rsv *rsv);
  *
  * @status:         negotiation status
  * @stream:         stream index allocated for this reservation
+ * @tiebreaker:     conflict tiebreaker for this reservation
  * @mas:            reserved MAS
  * @drp_ie:         the DRP IE
  * @ie_valid:       true iff the DRP IE matches the reservation parameters
@@ -225,19 +246,22 @@ struct uwb_rsv {
 	enum uwb_drp_type type;
 	int max_mas;
 	int min_mas;
-	int sparsity;
+	int max_interval;
 	bool is_multicast;
 
 	uwb_rsv_cb_f callback;
 	void *pal_priv;
 
 	enum uwb_rsv_state state;
+	bool needs_release_companion_mas;
 	u8 stream;
+	u8 tiebreaker;
 	struct uwb_mas_bm mas;
 	struct uwb_ie_drp *drp_ie;
+	struct uwb_rsv_move mv;
 	bool ie_valid;
 	struct timer_list timer;
-	bool expired;
+	struct work_struct handle_timeout_work;
 };
 
 static const
@@ -279,6 +303,13 @@ struct uwb_drp_avail {
 	bool ie_valid;
 };
 
+struct uwb_drp_backoff_win {
+	u8 window;
+	u8 n;
+	int total_expired;
+	struct timer_list timer;
+	bool can_reserve_extra_mases;
+};
 
 const char *uwb_rsv_state_str(enum uwb_rsv_state state);
 const char *uwb_rsv_type_str(enum uwb_drp_type type);
@@ -294,6 +325,8 @@ void uwb_rsv_terminate(struct uwb_rsv *rsv);
 
 void uwb_rsv_accept(struct uwb_rsv *rsv, uwb_rsv_cb_f cb, void *pal_priv);
 
+void uwb_rsv_get_usable_mas(struct uwb_rsv *orig_rsv, struct uwb_mas_bm *mas);
+
 /**
  * Radio Control Interface instance
  *
@@ -364,12 +397,18 @@ struct uwb_rc {
 
 	struct uwbd uwbd;
 
+	struct uwb_drp_backoff_win bow;
 	struct uwb_drp_avail drp_avail;
 	struct list_head reservations;
+	struct list_head cnflt_alien_list;
+	struct uwb_mas_bm cnflt_alien_bitmap;
 	struct mutex rsvs_mutex;
+	spinlock_t rsvs_lock;
 	struct workqueue_struct *rsv_workq;
-	struct work_struct rsv_update_work;
 
+	struct delayed_work rsv_update_work;
+	struct delayed_work rsv_alien_bp_work;
+	int set_drp_ie_pending;
 	struct mutex ies_mutex;
 	struct uwb_rc_cmd_set_ie *ies;
 	size_t ies_capacity;
diff --git a/include/linux/uwb/debug-cmd.h b/include/linux/uwb/debug-cmd.h
index 07efbe17db5..8da004e2562 100644
--- a/include/linux/uwb/debug-cmd.h
+++ b/include/linux/uwb/debug-cmd.h
@@ -43,7 +43,7 @@ struct uwb_dbg_cmd_rsv_establish {
 	__u8  type;
 	__u16 max_mas;
 	__u16 min_mas;
-	__u8  sparsity;
+	__u8  max_interval;
 };
 
 struct uwb_dbg_cmd_rsv_terminate {
diff --git a/include/linux/uwb/spec.h b/include/linux/uwb/spec.h
index a30436ea53a..b52e44f1bd3 100644
--- a/include/linux/uwb/spec.h
+++ b/include/linux/uwb/spec.h
@@ -58,6 +58,11 @@ enum { UWB_NUM_ZONES = 16 };
  */
 #define UWB_MAS_PER_ZONE (UWB_NUM_MAS / UWB_NUM_ZONES)
 
+/*
+ * Number of MAS required before a row can be considered available.
+ */
+#define UWB_USABLE_MAS_PER_ROW (UWB_NUM_ZONES - 1)
+
 /*
  * Number of streams per DRP reservation between a pair of devices.
  *
@@ -93,6 +98,26 @@ enum { UWB_BEACON_SLOT_LENGTH_US = 85 };
  */
 enum { UWB_MAX_LOST_BEACONS = 3 };
 
+/*
+ * mDRPBackOffWinMin
+ *
+ * The minimum number of superframes to wait before trying to reserve
+ * extra MAS.
+ *
+ * [ECMA-368] section 17.16
+ */
+enum { UWB_DRP_BACKOFF_WIN_MIN = 2 };
+
+/*
+ * mDRPBackOffWinMax
+ *
+ * The maximum number of superframes to wait before trying to reserve
+ * extra MAS.
+ *
+ * [ECMA-368] section 17.16
+ */
+enum { UWB_DRP_BACKOFF_WIN_MAX = 16 };
+
 /*
  * Length of a superframe in microseconds.
  */
-- 
cgit v1.2.3-70-g09d2


From 27af4245b6ce99e08c6a7c38825383bf51119cc9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 1 Dec 2008 14:18:13 -0800
Subject: posix-timers: use "struct pid*" instead of "struct task_struct*"

Impact: restructure, clean up code

k_itimer holds the ref to the ->it_process until sys_timer_delete(). This
allows to pin up to RLIMIT_SIGPENDING dead task_struct's. Change the code
to use "struct pid *" instead.

The patch doesn't kill ->it_process, it places ->it_pid into the union.
->it_process is still used by do_cpu_nanosleep() as before. It would be
trivial to change the nanosleep code as well, but since it uses it_process
in a special way I think it is better to keep this field for grep.

The patch bloats the kernel by 104 bytes and it also adds the new pointer,
->it_signal, to k_itimer. It is used by lock_timer() to verify that the
found timer was not created by another process. It is not clear why do we
use the global database (and thus the global idr_lock) for posix timers.
We still need the signal_struct->posix_timers which contains all useable
timers, perhaps it is better to use some form of per-process array
instead.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/posix-timers.h |  6 +++++-
 kernel/posix-timers.c        | 43 +++++++++++++++++++++++--------------------
 2 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a7c72135554..4f71bf4e628 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -45,7 +45,11 @@ struct k_itimer {
 	int it_requeue_pending;		/* waiting to requeue this timer */
 #define REQUEUE_PENDING 1
 	int it_sigev_notify;		/* notify word of sigevent struct */
-	struct task_struct *it_process;	/* process to send signal to */
+	struct signal_struct *it_signal;
+	union {
+		struct pid *it_pid;	/* pid of process to send signal to */
+		struct task_struct *it_process;	/* for clock_nanosleep */
+	};
 	struct sigqueue *sigq;		/* signal queue entry. */
 	union {
 		struct {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5e79c662294..42a39afd694 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
  *	    must supply functions here, even if the function just returns
  *	    ENOSYS.  The standard POSIX timer management code assumes the
  *	    following: 1.) The k_itimer struct (sched.h) is used for the
- *	    timer.  2.) The list, it_lock, it_clock, it_id and it_process
+ *	    timer.  2.) The list, it_lock, it_clock, it_id and it_pid
  *	    fields are not modified by timer code.
  *
  *          At this time all functions EXCEPT clock_nanosleep can be
@@ -313,7 +313,8 @@ void do_schedule_next_timer(struct siginfo *info)
 
 int posix_timer_event(struct k_itimer *timr, int si_private)
 {
-	int shared, ret;
+	struct task_struct *task;
+	int shared, ret = -1;
 	/*
 	 * FIXME: if ->sigq is queued we can race with
 	 * dequeue_signal()->do_schedule_next_timer().
@@ -327,8 +328,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	 */
 	timr->sigq->info.si_sys_private = si_private;
 
-	shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
-	ret = send_sigqueue(timr->sigq, timr->it_process, shared);
+	rcu_read_lock();
+	task = pid_task(timr->it_pid, PIDTYPE_PID);
+	if (task) {
+		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
+		ret = send_sigqueue(timr->sigq, task, shared);
+	}
+	rcu_read_unlock();
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
@@ -405,7 +411,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 	return ret;
 }
 
-static struct task_struct * good_sigevent(sigevent_t * event)
+static struct pid *good_sigevent(sigevent_t * event)
 {
 	struct task_struct *rtn = current->group_leader;
 
@@ -419,7 +425,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
 	    ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
 		return NULL;
 
-	return rtn;
+	return task_pid(rtn);
 }
 
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -471,7 +477,7 @@ sys_timer_create(const clockid_t which_clock,
 {
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
-	struct task_struct *process;
+	struct pid *it_pid;
 	sigevent_t event;
 	int it_id_set = IT_ID_NOT_SET;
 
@@ -525,11 +531,9 @@ sys_timer_create(const clockid_t which_clock,
 			goto out;
 		}
 		rcu_read_lock();
-		process = good_sigevent(&event);
-		if (process)
-			get_task_struct(process);
+		it_pid = get_pid(good_sigevent(&event));
 		rcu_read_unlock();
-		if (!process) {
+		if (!it_pid) {
 			error = -EINVAL;
 			goto out;
 		}
@@ -537,8 +541,7 @@ sys_timer_create(const clockid_t which_clock,
 		event.sigev_notify = SIGEV_SIGNAL;
 		event.sigev_signo = SIGALRM;
 		event.sigev_value.sival_int = new_timer->it_id;
-		process = current->group_leader;
-		get_task_struct(process);
+		it_pid = get_pid(task_tgid(current));
 	}
 
 	new_timer->it_sigev_notify     = event.sigev_notify;
@@ -548,7 +551,8 @@ sys_timer_create(const clockid_t which_clock,
 	new_timer->sigq->info.si_code  = SI_TIMER;
 
 	spin_lock_irq(&current->sighand->siglock);
-	new_timer->it_process = process;
+	new_timer->it_pid = it_pid;
+	new_timer->it_signal = current->signal;
 	list_add(&new_timer->list, &current->signal->posix_timers);
 	spin_unlock_irq(&current->sighand->siglock);
 
@@ -583,8 +587,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
 	timr = idr_find(&posix_timers_id, (int)timer_id);
 	if (timr) {
 		spin_lock(&timr->it_lock);
-		if (timr->it_process &&
-		    same_thread_group(timr->it_process, current)) {
+		if (timr->it_pid && timr->it_signal == current->signal) {
 			spin_unlock(&idr_lock);
 			return timr;
 		}
@@ -831,8 +834,8 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	put_task_struct(timer->it_process);
-	timer->it_process = NULL;
+	put_pid(timer->it_pid);
+	timer->it_pid = NULL;
 
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
@@ -858,8 +861,8 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	put_task_struct(timer->it_process);
-	timer->it_process = NULL;
+	put_pid(timer->it_pid);
+	timer->it_pid = NULL;
 
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
-- 
cgit v1.2.3-70-g09d2


From c29541b24fb2c6301021637229ae5347c877330a Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Mon, 1 Dec 2008 14:18:11 -0800
Subject: linux/timex.h: cleanup for userspace

Impact: fix user-space exported use

Move all the kernel-specific defines and includes into the __KERNEL__
section so that they don't get leaked into userspace.

[akpm@linux-foundation.org: coding-style fixes]

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timex.h | 73 ++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 9007313b5b7..998a55d80ac 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -53,46 +53,10 @@
 #ifndef _LINUX_TIMEX_H
 #define _LINUX_TIMEX_H
 
-#include <linux/compiler.h>
 #include <linux/time.h>
 
-#include <asm/param.h>
-
 #define NTP_API		4	/* NTP API version */
 
-/*
- * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen
- * for a slightly underdamped convergence characteristic. SHIFT_KH
- * establishes the damping of the FLL and is chosen by wisdom and black
- * art.
- *
- * MAXTC establishes the maximum time constant of the PLL. With the
- * SHIFT_KG and SHIFT_KF values given and a time constant range from
- * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours,
- * respectively.
- */
-#define SHIFT_PLL	4	/* PLL frequency factor (shift) */
-#define SHIFT_FLL	2	/* FLL frequency factor (shift) */
-#define MAXTC		10	/* maximum time constant (shift) */
-
-/*
- * SHIFT_USEC defines the scaling (shift) of the time_freq and
- * time_tolerance variables, which represent the current frequency
- * offset and maximum frequency tolerance.
- */
-#define SHIFT_USEC 16		/* frequency offset scale (shift) */
-#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
-#define PPM_SCALE_INV_SHIFT 19
-#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
-		       PPM_SCALE + 1)
-
-#define MAXPHASE 500000000l	/* max phase error (ns) */
-#define MAXFREQ 500000		/* max frequency error (ns/s) */
-#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT)
-#define MINSEC 256		/* min interval between updates (s) */
-#define MAXSEC 2048		/* max interval between updates (s) */
-#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */
-
 /*
  * syscall interface - used (mainly by NTP daemon)
  * to discipline kernel clock oscillator
@@ -199,8 +163,45 @@ struct timex {
 #define TIME_BAD	TIME_ERROR /* bw compat */
 
 #ifdef __KERNEL__
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/param.h>
+
 #include <asm/timex.h>
 
+/*
+ * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen
+ * for a slightly underdamped convergence characteristic. SHIFT_KH
+ * establishes the damping of the FLL and is chosen by wisdom and black
+ * art.
+ *
+ * MAXTC establishes the maximum time constant of the PLL. With the
+ * SHIFT_KG and SHIFT_KF values given and a time constant range from
+ * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours,
+ * respectively.
+ */
+#define SHIFT_PLL	4	/* PLL frequency factor (shift) */
+#define SHIFT_FLL	2	/* FLL frequency factor (shift) */
+#define MAXTC		10	/* maximum time constant (shift) */
+
+/*
+ * SHIFT_USEC defines the scaling (shift) of the time_freq and
+ * time_tolerance variables, which represent the current frequency
+ * offset and maximum frequency tolerance.
+ */
+#define SHIFT_USEC 16		/* frequency offset scale (shift) */
+#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
+#define PPM_SCALE_INV_SHIFT 19
+#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
+		       PPM_SCALE + 1)
+
+#define MAXPHASE 500000000l	/* max phase error (ns) */
+#define MAXFREQ 500000		/* max frequency error (ns/s) */
+#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT)
+#define MINSEC 256		/* min interval between updates (s) */
+#define MAXSEC 2048		/* max interval between updates (s) */
+#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */
+
 /*
  * kernel variables
  * Note: maximum error = NTP synch distance = dispersion + delay / 2;
-- 
cgit v1.2.3-70-g09d2


From bb608e9db7d29616fb6e0d856c23434610d4a1bd Mon Sep 17 00:00:00 2001
From: Senthil Balasubramanian <senthilkumar@atheros.com>
Date: Thu, 4 Dec 2008 20:38:13 +0530
Subject: wireless: Incorrect LEAP authentication algorithm identifier.

This patch fixes a regression introduced by
"wireless: avoid some net/ieee80211.h vs. linux/ieee80211.h conflicts"
LEAP authentication algorithm identifier should be 128.

Signed-off-by: Senthil Balasubramanian <senthilkumar@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a6ec928186a..c4e6ca1a630 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -836,7 +836,7 @@ struct ieee80211_ht_info {
 /* Authentication algorithms */
 #define WLAN_AUTH_OPEN 0
 #define WLAN_AUTH_SHARED_KEY 1
-#define WLAN_AUTH_LEAP 2
+#define WLAN_AUTH_LEAP 128
 
 #define WLAN_AUTH_CHALLENGE_LEN 128
 
-- 
cgit v1.2.3-70-g09d2


From 4dec9b807be757780ca3611a959ac22c28d292a7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 10 Dec 2008 17:48:48 +0100
Subject: rfkill: strip pointless notifier chain

No users, so no reason to have it.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Ivo van Doorn <IvDoorn@gmail.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/rfkill.h |  7 -----
 net/rfkill/rfkill.c    | 83 +++-----------------------------------------------
 2 files changed, 5 insertions(+), 85 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h
index f376a93927f..164332cbb77 100644
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -149,11 +149,4 @@ static inline char *rfkill_get_led_name(struct rfkill *rfkill)
 #endif
 }
 
-/* rfkill notification chain */
-#define RFKILL_STATE_CHANGED		0x0001	/* state of a normal rfkill
-						   switch has changed */
-
-int register_rfkill_notifier(struct notifier_block *nb);
-int unregister_rfkill_notifier(struct notifier_block *nb);
-
 #endif /* RFKILL_H */
diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c
index 051d2c9ea66..3c94f76d552 100644
--- a/net/rfkill/rfkill.c
+++ b/net/rfkill/rfkill.c
@@ -53,51 +53,6 @@ static struct rfkill_gsw_state rfkill_global_states[RFKILL_TYPE_MAX];
 static unsigned long rfkill_states_lockdflt[BITS_TO_LONGS(RFKILL_TYPE_MAX)];
 static bool rfkill_epo_lock_active;
 
-static BLOCKING_NOTIFIER_HEAD(rfkill_notifier_list);
-
-
-/**
- * register_rfkill_notifier - Add notifier to rfkill notifier chain
- * @nb: pointer to the new entry to add to the chain
- *
- * See blocking_notifier_chain_register() for return value and further
- * observations.
- *
- * Adds a notifier to the rfkill notifier chain.  The chain will be
- * called with a pointer to the relevant rfkill structure as a parameter,
- * refer to include/linux/rfkill.h for the possible events.
- *
- * Notifiers added to this chain are to always return NOTIFY_DONE.  This
- * chain is a blocking notifier chain: notifiers can sleep.
- *
- * Calls to this chain may have been done through a workqueue.  One must
- * assume unordered asynchronous behaviour, there is no way to know if
- * actions related to the event that generated the notification have been
- * carried out already.
- */
-int register_rfkill_notifier(struct notifier_block *nb)
-{
-	BUG_ON(!nb);
-	return blocking_notifier_chain_register(&rfkill_notifier_list, nb);
-}
-EXPORT_SYMBOL_GPL(register_rfkill_notifier);
-
-/**
- * unregister_rfkill_notifier - remove notifier from rfkill notifier chain
- * @nb: pointer to the entry to remove from the chain
- *
- * See blocking_notifier_chain_unregister() for return value and further
- * observations.
- *
- * Removes a notifier from the rfkill notifier chain.
- */
-int unregister_rfkill_notifier(struct notifier_block *nb)
-{
-	BUG_ON(!nb);
-	return blocking_notifier_chain_unregister(&rfkill_notifier_list, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_rfkill_notifier);
-
 
 static void rfkill_led_trigger(struct rfkill *rfkill,
 			       enum rfkill_state state)
@@ -124,12 +79,9 @@ static void rfkill_led_trigger_activate(struct led_classdev *led)
 }
 #endif /* CONFIG_RFKILL_LEDS */
 
-static void notify_rfkill_state_change(struct rfkill *rfkill)
+static void rfkill_uevent(struct rfkill *rfkill)
 {
-	rfkill_led_trigger(rfkill, rfkill->state);
-	blocking_notifier_call_chain(&rfkill_notifier_list,
-			RFKILL_STATE_CHANGED,
-			rfkill);
+	kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE);
 }
 
 static void update_rfkill_state(struct rfkill *rfkill)
@@ -142,7 +94,7 @@ static void update_rfkill_state(struct rfkill *rfkill)
 			oldstate = rfkill->state;
 			rfkill->state = newstate;
 			if (oldstate != newstate)
-				notify_rfkill_state_change(rfkill);
+				rfkill_uevent(rfkill);
 		}
 		mutex_unlock(&rfkill->mutex);
 	}
@@ -220,7 +172,7 @@ static int rfkill_toggle_radio(struct rfkill *rfkill,
 	}
 
 	if (force || rfkill->state != oldstate)
-		notify_rfkill_state_change(rfkill);
+		rfkill_uevent(rfkill);
 
 	return retval;
 }
@@ -405,7 +357,7 @@ int rfkill_force_state(struct rfkill *rfkill, enum rfkill_state state)
 	rfkill->state = state;
 
 	if (state != oldstate)
-		notify_rfkill_state_change(rfkill);
+		rfkill_uevent(rfkill);
 
 	mutex_unlock(&rfkill->mutex);
 
@@ -618,28 +570,6 @@ static int rfkill_resume(struct device *dev)
 #define rfkill_resume NULL
 #endif
 
-static int rfkill_blocking_uevent_notifier(struct notifier_block *nb,
-					unsigned long eventid,
-					void *data)
-{
-	struct rfkill *rfkill = (struct rfkill *)data;
-
-	switch (eventid) {
-	case RFKILL_STATE_CHANGED:
-		kobject_uevent(&rfkill->dev.kobj, KOBJ_CHANGE);
-		break;
-	default:
-		break;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block rfkill_blocking_uevent_nb = {
-	.notifier_call	= rfkill_blocking_uevent_notifier,
-	.priority	= 0,
-};
-
 static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
 	struct rfkill *rfkill = to_rfkill(dev);
@@ -942,14 +872,11 @@ static int __init rfkill_init(void)
 		return error;
 	}
 
-	register_rfkill_notifier(&rfkill_blocking_uevent_nb);
-
 	return 0;
 }
 
 static void __exit rfkill_exit(void)
 {
-	unregister_rfkill_notifier(&rfkill_blocking_uevent_nb);
 	class_unregister(&rfkill_class);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 29c0177e6a4ac094302bed54a1d4bbb6b740a9ef Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:25 +1030
Subject: cpumask: change cpumask_scnprintf, cpumask_parse_user, cpulist_parse,
 and cpulist_scnprintf to take pointers.

Impact: change calling convention of existing cpumask APIs

Most cpumask functions started with cpus_: these have been replaced by
cpumask_ ones which take struct cpumask pointers as expected.

These four functions don't have good replacement names; fortunately
they're rarely used, so we just change them over.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: paulus@samba.org
Cc: mingo@redhat.com
Cc: tony.luck@intel.com
Cc: ralf@linux-mips.org
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: cl@linux-foundation.org
Cc: srostedt@redhat.com
---
 arch/ia64/kernel/topology.c           |  2 +-
 arch/mips/kernel/smp-cmp.c            |  4 +-
 arch/powerpc/platforms/pseries/xics.c |  2 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c |  4 +-
 arch/x86/kernel/setup_percpu.c        |  2 +-
 drivers/base/cpu.c                    |  2 +-
 drivers/base/node.c                   |  4 +-
 drivers/base/topology.c               |  4 +-
 drivers/pci/pci-sysfs.c               |  4 +-
 drivers/pci/probe.c                   |  4 +-
 include/linux/cpumask.h               | 87 +++++++++++++++++++++++------------
 kernel/cpuset.c                       |  4 +-
 kernel/irq/proc.c                     |  4 +-
 kernel/profile.c                      |  4 +-
 kernel/sched.c                        |  4 +-
 kernel/sched_stats.h                  |  2 +-
 kernel/taskstats.c                    |  2 +-
 kernel/trace/trace.c                  |  4 +-
 mm/slub.c                             |  2 +-
 19 files changed, 86 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index c75b914f2d6..a8d61a3e9a9 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -219,7 +219,7 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
 	cpumask_t shared_cpu_map;
 
 	cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map);
-	len = cpumask_scnprintf(buf, NR_CPUS+1, shared_cpu_map);
+	len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map);
 	len += sprintf(buf+len, "\n");
 	return len;
 }
diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c
index 6789c1a1212..f27beca4b26 100644
--- a/arch/mips/kernel/smp-cmp.c
+++ b/arch/mips/kernel/smp-cmp.c
@@ -51,10 +51,10 @@ static int __init allowcpus(char *str)
 	int len;
 
 	cpus_clear(cpu_allow_map);
-	if (cpulist_parse(str, cpu_allow_map) == 0) {
+	if (cpulist_parse(str, &cpu_allow_map) == 0) {
 		cpu_set(0, cpu_allow_map);
 		cpus_and(cpu_possible_map, cpu_possible_map, cpu_allow_map);
-		len = cpulist_scnprintf(buf, sizeof(buf)-1, cpu_possible_map);
+		len = cpulist_scnprintf(buf, sizeof(buf)-1, &cpu_possible_map);
 		buf[len] = '\0';
 		pr_debug("Allowable CPUs: %s\n", buf);
 		return 1;
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index e1904774a70..64d24310ce7 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -358,7 +358,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 	irq_server = get_irq_server(virq, 1);
 	if (irq_server == -1) {
 		char cpulist[128];
-		cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask);
+		cpumask_scnprintf(cpulist, sizeof(cpulist), &cpumask);
 		printk(KERN_WARNING
 			"%s: No online cpus in the mask %s for irq %d\n",
 			__func__, cpulist, virq);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3f46afbb1cf..43ea612d3e9 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -626,8 +626,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
 		cpumask_t *mask = &this_leaf->shared_cpu_map;
 
 		n = type?
-			cpulist_scnprintf(buf, len-2, *mask):
-			cpumask_scnprintf(buf, len-2, *mask);
+			cpulist_scnprintf(buf, len-2, mask) :
+			cpumask_scnprintf(buf, len-2, mask);
 		buf[n++] = '\n';
 		buf[n] = '\0';
 	}
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3bb77..1c2084291f9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -282,7 +282,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
 	else
 		cpu_clear(cpu, *mask);
 
-	cpulist_scnprintf(buf, sizeof(buf), *mask);
+	cpulist_scnprintf(buf, sizeof(buf), mask);
 	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
 		enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
  }
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 64f5d54f7ed..4259072f5bd 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -109,7 +109,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  */
 static ssize_t print_cpus_map(char *buf, cpumask_t *map)
 {
-	int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map);
+	int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
 
 	buf[n++] = '\n';
 	buf[n] = '\0';
diff --git a/drivers/base/node.c b/drivers/base/node.c
index f5207090885..91636cd8b6c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -30,8 +30,8 @@ static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
 	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
 
 	len = type?
-		cpulist_scnprintf(buf, PAGE_SIZE-2, *mask):
-		cpumask_scnprintf(buf, PAGE_SIZE-2, *mask);
+		cpulist_scnprintf(buf, PAGE_SIZE-2, mask) :
+		cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
  	buf[len++] = '\n';
  	buf[len] = '\0';
 	return len;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 199cd97e32e..a8bc1cbcfa7 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -49,8 +49,8 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 
 	if (len > 1) {
 		n = type?
-			cpulist_scnprintf(buf, len-2, *mask):
-			cpumask_scnprintf(buf, len-2, *mask);
+			cpulist_scnprintf(buf, len-2, mask) :
+			cpumask_scnprintf(buf, len-2, mask);
 		buf[n++] = '\n';
 		buf[n] = '\0';
 	}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 5d72866897a..c88485860a0 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -74,7 +74,7 @@ static ssize_t local_cpus_show(struct device *dev,
 	int len;
 
 	mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-	len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
+	len = cpumask_scnprintf(buf, PAGE_SIZE-2, &mask);
 	buf[len++] = '\n';
 	buf[len] = '\0';
 	return len;
@@ -88,7 +88,7 @@ static ssize_t local_cpulist_show(struct device *dev,
 	int len;
 
 	mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-	len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask);
+	len = cpulist_scnprintf(buf, PAGE_SIZE-2, &mask);
 	buf[len++] = '\n';
 	buf[len] = '\0';
 	return len;
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 003a9b3c293..5b3f5937ecf 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -55,8 +55,8 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev,
 
 	cpumask = pcibus_to_cpumask(to_pci_bus(dev));
 	ret = type?
-		cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask):
-		cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask);
+		cpulist_scnprintf(buf, PAGE_SIZE-2, &cpumask) :
+		cpumask_scnprintf(buf, PAGE_SIZE-2, &cpumask);
 	buf[ret++] = '\n';
 	buf[ret] = '\0';
 	return ret;
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 21e1dd43e52..94a2ab88ae8 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -339,36 +339,6 @@ extern cpumask_t cpu_mask_all;
 #endif
 #define	CPUMASK_PTR(v, m) 	cpumask_t *v = &(m->v)
 
-#define cpumask_scnprintf(buf, len, src) \
-			__cpumask_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpumask_scnprintf(char *buf, int len,
-					const cpumask_t *srcp, int nbits)
-{
-	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpumask_parse_user(ubuf, ulen, dst) \
-			__cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
-static inline int __cpumask_parse_user(const char __user *buf, int len,
-					cpumask_t *dstp, int nbits)
-{
-	return bitmap_parse_user(buf, len, dstp->bits, nbits);
-}
-
-#define cpulist_scnprintf(buf, len, src) \
-			__cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpulist_scnprintf(char *buf, int len,
-					const cpumask_t *srcp, int nbits)
-{
-	return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS)
-static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits)
-{
-	return bitmap_parselist(buf, dstp->bits, nbits);
-}
-
 #define cpu_remap(oldbit, old, new) \
 		__cpu_remap((oldbit), &(old), &(new), NR_CPUS)
 static inline int __cpu_remap(int oldbit,
@@ -945,6 +915,63 @@ static inline void cpumask_copy(struct cpumask *dstp,
  */
 #define cpumask_of(cpu) (get_cpu_mask(cpu))
 
+/**
+ * cpumask_scnprintf - print a cpumask into a string as comma-separated hex
+ * @buf: the buffer to sprintf into
+ * @len: the length of the buffer
+ * @srcp: the cpumask to print
+ *
+ * If len is zero, returns zero.  Otherwise returns the length of the
+ * (nul-terminated) @buf string.
+ */
+static inline int cpumask_scnprintf(char *buf, int len,
+				    const struct cpumask *srcp)
+{
+	return bitmap_scnprintf(buf, len, srcp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpumask_parse_user - extract a cpumask from a user string
+ * @buf: the buffer to extract from
+ * @len: the length of the buffer
+ * @dstp: the cpumask to set.
+ *
+ * Returns -errno, or 0 for success.
+ */
+static inline int cpumask_parse_user(const char __user *buf, int len,
+				     struct cpumask *dstp)
+{
+	return bitmap_parse_user(buf, len, dstp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpulist_scnprintf - print a cpumask into a string as comma-separated list
+ * @buf: the buffer to sprintf into
+ * @len: the length of the buffer
+ * @srcp: the cpumask to print
+ *
+ * If len is zero, returns zero.  Otherwise returns the length of the
+ * (nul-terminated) @buf string.
+ */
+static inline int cpulist_scnprintf(char *buf, int len,
+				    const struct cpumask *srcp)
+{
+	return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpulist_parse_user - extract a cpumask from a user string of ranges
+ * @buf: the buffer to extract from
+ * @len: the length of the buffer
+ * @dstp: the cpumask to set.
+ *
+ * Returns -errno, or 0 for success.
+ */
+static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
+{
+	return bitmap_parselist(buf, dstp->bits, nr_cpumask_bits);
+}
+
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8c..39c1a4c1c5a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	if (!*buf) {
 		cpus_clear(trialcs.cpus_allowed);
 	} else {
-		retval = cpulist_parse(buf, trialcs.cpus_allowed);
+		retval = cpulist_parse(buf, &trialcs.cpus_allowed);
 		if (retval < 0)
 			return retval;
 
@@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 	mask = cs->cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	return cpulist_scnprintf(page, PAGE_SIZE, mask);
+	return cpulist_scnprintf(page, PAGE_SIZE, &mask);
 }
 
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d257e7d6a8a..f293349d49d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -47,7 +47,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 	    irq_balancing_disabled(irq))
 		return -EIO;
 
-	err = cpumask_parse_user(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, &new_value);
 	if (err)
 		return err;
 
@@ -95,7 +95,7 @@ static ssize_t default_affinity_write(struct file *file,
 	cpumask_t new_value;
 	int err;
 
-	err = cpumask_parse_user(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, &new_value);
 	if (err)
 		return err;
 
diff --git a/kernel/profile.c b/kernel/profile.c
index dc41827fbfe..7d620dfdde5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,7 +442,7 @@ void profile_tick(int type)
 static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 {
-	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+	int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
 	if (count - len < 2)
 		return -EINVAL;
 	len += sprintf(page + len, "\n");
@@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file,
 	unsigned long full_count = count, err;
 	cpumask_t new_value;
 
-	err = cpumask_parse_user(buffer, count, new_value);
+	err = cpumask_parse_user(buffer, count, &new_value);
 	if (err)
 		return err;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e4bb1dd7b30..d2d16d1273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6666,7 +6666,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 	struct sched_group *group = sd->groups;
 	char str[256];
 
-	cpulist_scnprintf(str, sizeof(str), sd->span);
+	cpulist_scnprintf(str, sizeof(str), &sd->span);
 	cpus_clear(*groupmask);
 
 	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -6720,7 +6720,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 		cpus_or(*groupmask, *groupmask, group->cpumask);
 
-		cpulist_scnprintf(str, sizeof(str), group->cpumask);
+		cpulist_scnprintf(str, sizeof(str), &group->cpumask);
 		printk(KERN_CONT " %s", str);
 
 		group = group->next;
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02..6beff1e4eea 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		for_each_domain(cpu, sd) {
 			enum cpu_idle_type itype;
 
-			cpumask_scnprintf(mask_str, mask_len, sd->span);
+			cpumask_scnprintf(mask_str, mask_len, &sd->span);
 			seq_printf(seq, "domain%d %s", dcount++, mask_str);
 			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
 					itype++) {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bd6be76303c..6d7dc4ec4aa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
 	if (!data)
 		return -ENOMEM;
 	nla_strlcpy(data, na, len);
-	ret = cpulist_parse(data, *mask);
+	ret = cpulist_parse(data, mask);
 	kfree(data);
 	return ret;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d86e3252f30..d2e75479dc5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2126,7 +2126,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
 
 	mutex_lock(&tracing_cpumask_update_lock);
 
-	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
 	if (count - len < 2) {
 		count = -EINVAL;
 		goto out_err;
@@ -2147,7 +2147,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	int err, cpu;
 
 	mutex_lock(&tracing_cpumask_update_lock);
-	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+	err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
 	if (err)
 		goto err_unlock;
 
diff --git a/mm/slub.c b/mm/slub.c
index a2cd47d89e0..8e516e29f98 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3626,7 +3626,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
 				len < PAGE_SIZE - 60) {
 			len += sprintf(buf + len, " cpus=");
 			len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
-					l->cpus);
+					&l->cpus);
 		}
 
 		if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
-- 
cgit v1.2.3-70-g09d2


From 0de26520c7cabf36e1de090ea8092f011a6106ce Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:26 +1030
Subject: cpumask: make irq_set_affinity() take a const struct cpumask

Impact: change existing irq_chip API

Not much point with gentle transition here: the struct irq_chip's
setaffinity method signature needs to change.

Fortunately, not widely used code, but hits a few architectures.

Note: In irq_select_affinity() I save a temporary in by mangling
irq_desc[irq].affinity directly.  Ingo, does this break anything?

(Folded in fix from KOSAKI Motohiro)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Grant Grundler <grundler@parisc-linux.org>
Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: ralf@linux-mips.org
Cc: grundler@parisc-linux.org
Cc: jeremy@xensource.com
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 arch/alpha/kernel/irq.c               |  2 +-
 arch/alpha/kernel/sys_dp264.c         |  8 ++--
 arch/alpha/kernel/sys_titan.c         |  4 +-
 arch/arm/common/gic.c                 |  4 +-
 arch/arm/kernel/irq.c                 |  2 +-
 arch/arm/oprofile/op_model_mpcore.c   |  4 +-
 arch/cris/arch-v32/kernel/irq.c       |  4 +-
 arch/ia64/hp/sim/hpsim_irq.c          |  2 +-
 arch/ia64/kernel/iosapic.c            | 12 +++---
 arch/ia64/kernel/irq.c                |  9 ++--
 arch/ia64/kernel/msi_ia64.c           | 12 +++---
 arch/ia64/kernel/smpboot.c            |  4 +-
 arch/ia64/sn/kernel/irq.c             |  6 +--
 arch/ia64/sn/kernel/msi_sn.c          |  7 +--
 arch/mips/include/asm/irq.h           |  3 +-
 arch/mips/kernel/cevt-bcm1480.c       |  2 +-
 arch/mips/kernel/cevt-sb1250.c        |  2 +-
 arch/mips/kernel/irq-gic.c            |  6 +--
 arch/mips/mti-malta/malta-smtc.c      |  6 +--
 arch/mips/sibyte/bcm1480/irq.c        |  8 ++--
 arch/mips/sibyte/sb1250/irq.c         |  8 ++--
 arch/parisc/kernel/irq.c              |  6 +--
 arch/powerpc/kernel/irq.c             |  2 +-
 arch/powerpc/platforms/pseries/xics.c |  6 +--
 arch/powerpc/sysdev/mpic.c            |  4 +-
 arch/powerpc/sysdev/mpic.h            |  2 +-
 arch/sparc64/kernel/irq.c             | 11 +++--
 arch/sparc64/kernel/of_device.c       |  2 +-
 arch/sparc64/kernel/pci_msi.c         |  2 +-
 arch/x86/kernel/hpet.c                |  4 +-
 arch/x86/kernel/io_apic.c             | 81 +++++++++++++++++------------------
 arch/x86/kernel/irq_32.c              |  2 +-
 arch/x86/kernel/irq_64.c              |  2 +-
 drivers/parisc/iosapic.c              |  7 +--
 drivers/xen/events.c                  |  6 +--
 include/linux/interrupt.h             |  4 +-
 include/linux/irq.h                   |  3 +-
 kernel/irq/chip.c                     |  2 +-
 kernel/irq/manage.c                   | 22 +++++-----
 kernel/irq/migration.c                | 14 +++---
 kernel/irq/proc.c                     | 29 ++++++++-----
 kernel/time/tick-common.c             |  6 +--
 42 files changed, 171 insertions(+), 161 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index c626a821cdc..d0f1620007f 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
 	last_cpu = cpu;
 
 	irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-	irq_desc[irq].chip->set_affinity(irq, cpumask_of_cpu(cpu));
+	irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
 	return 0;
 }
 #endif /* CONFIG_SMP */
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index c71b0fd7a61..ab44c164d9d 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -177,19 +177,19 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 }
 
 static void
-dp264_set_affinity(unsigned int irq, cpumask_t affinity)
+dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&dp264_irq_lock);
-	cpu_set_irq_affinity(irq, affinity);
+	cpu_set_irq_affinity(irq, *affinity);
 	tsunami_update_irq_hw(cached_irq_mask);
 	spin_unlock(&dp264_irq_lock);
 }
 
 static void
-clipper_set_affinity(unsigned int irq, cpumask_t affinity)
+clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&dp264_irq_lock);
-	cpu_set_irq_affinity(irq - 16, affinity);
+	cpu_set_irq_affinity(irq - 16, *affinity);
 	tsunami_update_irq_hw(cached_irq_mask);
 	spin_unlock(&dp264_irq_lock);
 }
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 52c91ccc164..27f840a4ad3 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -158,10 +158,10 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 }
 
 static void
-titan_set_irq_affinity(unsigned int irq, cpumask_t affinity)
+titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&titan_irq_lock);
-	titan_cpu_set_irq_affinity(irq - 16, affinity);
+	titan_cpu_set_irq_affinity(irq - 16, *affinity);
 	titan_update_irq_hw(titan_cached_irq_mask);
 	spin_unlock(&titan_irq_lock);
 }
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index 7fc9860a97d..c6884ba1d5e 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -109,11 +109,11 @@ static void gic_unmask_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, cpumask_t mask_val)
+static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 {
 	void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
 	unsigned int shift = (irq % 4) * 8;
-	unsigned int cpu = first_cpu(mask_val);
+	unsigned int cpu = cpumask_first(mask_val);
 	u32 val;
 
 	spin_lock(&irq_controller_lock);
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 2f3eb795fa6..7141cee1fab 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -174,7 +174,7 @@ static void route_irq(struct irq_desc *desc, unsigned int irq, unsigned int cpu)
 	pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu);
 
 	spin_lock_irq(&desc->lock);
-	desc->chip->set_affinity(irq, cpumask_of_cpu(cpu));
+	desc->chip->set_affinity(irq, cpumask_of(cpu));
 	spin_unlock_irq(&desc->lock);
 }
 
diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c
index 4de366e8b4c..6d6bd589924 100644
--- a/arch/arm/oprofile/op_model_mpcore.c
+++ b/arch/arm/oprofile/op_model_mpcore.c
@@ -260,10 +260,10 @@ static void em_stop(void)
 static void em_route_irq(int irq, unsigned int cpu)
 {
 	struct irq_desc *desc = irq_desc + irq;
-	cpumask_t mask = cpumask_of_cpu(cpu);
+	const struct cpumask *mask = cpumask_of(cpu);
 
 	spin_lock_irq(&desc->lock);
-	desc->affinity = mask;
+	desc->affinity = *mask;
 	desc->chip->set_affinity(irq, mask);
 	spin_unlock_irq(&desc->lock);
 }
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c
index 173c141ac9b..295131fee71 100644
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -325,11 +325,11 @@ static void end_crisv32_irq(unsigned int irq)
 {
 }
 
-void set_affinity_crisv32_irq(unsigned int irq, cpumask_t dest)
+void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&irq_lock, flags);
-	irq_allocations[irq - FIRST_IRQ].mask = dest;
+	irq_allocations[irq - FIRST_IRQ].mask = *dest;
 	spin_unlock_irqrestore(&irq_lock, flags);
 }
 
diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c
index c2f58ff364e..cc0a3182db3 100644
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -22,7 +22,7 @@ hpsim_irq_noop (unsigned int irq)
 }
 
 static void
-hpsim_set_affinity_noop (unsigned int a, cpumask_t b)
+hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
 {
 }
 
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 5c4674ae8ae..c8adecd5b41 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -330,25 +330,25 @@ unmask_irq (unsigned int irq)
 
 
 static void
-iosapic_set_affinity (unsigned int irq, cpumask_t mask)
+iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
 	u32 high32, low32;
-	int dest, rte_index;
+	int cpu, dest, rte_index;
 	int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
 	struct iosapic_rte_info *rte;
 	struct iosapic *iosapic;
 
 	irq &= (~IA64_IRQ_REDIRECTED);
 
-	cpus_and(mask, mask, cpu_online_map);
-	if (cpus_empty(mask))
+	cpu = cpumask_first_and(cpu_online_mask, mask);
+	if (cpu >= nr_cpu_ids)
 		return;
 
-	if (irq_prepare_move(irq, first_cpu(mask)))
+	if (irq_prepare_move(irq, cpu))
 		return;
 
-	dest = cpu_physical_id(first_cpu(mask));
+	dest = cpu_physical_id(cpu);
 
 	if (!iosapic_intr_info[irq].count)
 		return;			/* not an IOSAPIC interrupt */
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 7fd18f54c05..0b6db53fedc 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -133,7 +133,6 @@ unsigned int vectors_in_migration[NR_IRQS];
  */
 static void migrate_irqs(void)
 {
-	cpumask_t	mask;
 	irq_desc_t *desc;
 	int 		irq, new_cpu;
 
@@ -152,15 +151,14 @@ static void migrate_irqs(void)
 		if (desc->status == IRQ_PER_CPU)
 			continue;
 
-		cpus_and(mask, irq_desc[irq].affinity, cpu_online_map);
-		if (any_online_cpu(mask) == NR_CPUS) {
+		if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask)
+		    >= nr_cpu_ids) {
 			/*
 			 * Save it for phase 2 processing
 			 */
 			vectors_in_migration[irq] = irq;
 
 			new_cpu = any_online_cpu(cpu_online_map);
-			mask = cpumask_of_cpu(new_cpu);
 
 			/*
 			 * Al three are essential, currently WARN_ON.. maybe panic?
@@ -168,7 +166,8 @@ static void migrate_irqs(void)
 			if (desc->chip && desc->chip->disable &&
 				desc->chip->enable && desc->chip->set_affinity) {
 				desc->chip->disable(irq);
-				desc->chip->set_affinity(irq, mask);
+				desc->chip->set_affinity(irq,
+							 cpumask_of(new_cpu));
 				desc->chip->enable(irq);
 			} else {
 				WARN_ON((!(desc->chip) || !(desc->chip->disable) ||
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 702a09c1323..89033933903 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -49,11 +49,12 @@
 static struct irq_chip	ia64_msi_chip;
 
 #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
+static void ia64_set_msi_irq_affinity(unsigned int irq,
+				      const cpumask_t *cpu_mask)
 {
 	struct msi_msg msg;
 	u32 addr, data;
-	int cpu = first_cpu(cpu_mask);
+	int cpu = first_cpu(*cpu_mask);
 
 	if (!cpu_online(cpu))
 		return;
@@ -166,12 +167,11 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
 	struct msi_msg msg;
-	int cpu = first_cpu(mask);
-
+	int cpu = cpumask_first(mask);
 
 	if (!cpu_online(cpu))
 		return;
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
 
 	dmar_msi_write(irq, &msg);
-	irq_desc[irq].affinity = mask;
+	irq_desc[irq].affinity = *mask;
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 4ede6e571c3..11463994a7d 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -682,7 +682,7 @@ int migrate_platform_irqs(unsigned int cpu)
 {
 	int new_cpei_cpu;
 	irq_desc_t *desc = NULL;
-	cpumask_t 	mask;
+	const struct cpumask *mask;
 	int 		retval = 0;
 
 	/*
@@ -695,7 +695,7 @@ int migrate_platform_irqs(unsigned int cpu)
 			 * Now re-target the CPEI to a different processor
 			 */
 			new_cpei_cpu = any_online_cpu(cpu_online_map);
-			mask = cpumask_of_cpu(new_cpei_cpu);
+			mask = cpumask_of(new_cpei_cpu);
 			set_cpei_target_cpu(new_cpei_cpu);
 			desc = irq_desc + ia64_cpe_irq;
 			/*
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 0c66dbdd1d7..66fd705e82c 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -227,14 +227,14 @@ finish_up:
 	return new_irq_info;
 }
 
-static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
+static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
 	struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
 	nasid_t nasid;
 	int slice;
 
-	nasid = cpuid_to_nasid(first_cpu(mask));
-	slice = cpuid_to_slice(first_cpu(mask));
+	nasid = cpuid_to_nasid(cpumask_first(mask));
+	slice = cpuid_to_slice(cpumask_first(mask));
 
 	list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
 				 sn_irq_lh[irq], list)
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 83f190ffe35..ca553b0429c 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -151,7 +151,8 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
 }
 
 #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
+static void sn_set_msi_irq_affinity(unsigned int irq,
+				    const struct cpumask *cpu_mask)
 {
 	struct msi_msg msg;
 	int slice;
@@ -164,7 +165,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
 	struct sn_pcibus_provider *provider;
 	unsigned int cpu;
 
-	cpu = first_cpu(cpu_mask);
+	cpu = cpumask_first(cpu_mask);
 	sn_irq_info = sn_msi_info[irq].sn_irq_info;
 	if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
 		return;
@@ -204,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
 	msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
 
 	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = cpu_mask;
+	irq_desc[irq].affinity = *cpu_mask;
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index a58f0eecc68..abc62aa744a 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -49,7 +49,8 @@ static inline void smtc_im_ack_irq(unsigned int irq)
 #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
 #include <linux/cpumask.h>
 
-extern void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity);
+extern void plat_set_irq_affinity(unsigned int irq,
+				  const struct cpumask *affinity);
 extern void smtc_forward_irq(unsigned int irq);
 
 /*
diff --git a/arch/mips/kernel/cevt-bcm1480.c b/arch/mips/kernel/cevt-bcm1480.c
index 0a57f86945f..d7e21bc8cd2 100644
--- a/arch/mips/kernel/cevt-bcm1480.c
+++ b/arch/mips/kernel/cevt-bcm1480.c
@@ -148,6 +148,6 @@ void __cpuinit sb1480_clockevent_init(void)
 	action->name	= name;
 	action->dev_id	= cd;
 
-	irq_set_affinity(irq, cpumask_of_cpu(cpu));
+	irq_set_affinity(irq, cpumask_of(cpu));
 	setup_irq(irq, action);
 }
diff --git a/arch/mips/kernel/cevt-sb1250.c b/arch/mips/kernel/cevt-sb1250.c
index 63ac3ad462b..0f188cd46e0 100644
--- a/arch/mips/kernel/cevt-sb1250.c
+++ b/arch/mips/kernel/cevt-sb1250.c
@@ -147,6 +147,6 @@ void __cpuinit sb1250_clockevent_init(void)
 	action->name	= name;
 	action->dev_id	= cd;
 
-	irq_set_affinity(irq, cpumask_of_cpu(cpu));
+	irq_set_affinity(irq, cpumask_of(cpu));
 	setup_irq(irq, action);
 }
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index f0a4bb19e09..494a49a317e 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
 
 static DEFINE_SPINLOCK(gic_lock);
 
-static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
+static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	cpumask_t	tmp = CPU_MASK_NONE;
 	unsigned long	flags;
@@ -164,7 +164,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
 	pr_debug(KERN_DEBUG "%s called\n", __func__);
 	irq -= _irqbase;
 
-	cpus_and(tmp, cpumask, cpu_online_map);
+	cpumask_and(&tmp, cpumask, cpu_online_mask);
 	if (cpus_empty(tmp))
 		return;
 
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
 		set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
 
 	}
-	irq_desc[irq].affinity = cpumask;
+	irq_desc[irq].affinity = *cpumask;
 	spin_unlock_irqrestore(&gic_lock, flags);
 
 }
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index f84a46a8ae6..aabd7274507 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -114,9 +114,9 @@ struct plat_smp_ops msmtc_smp_ops = {
  */
 
 
-void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity)
+void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
-	cpumask_t tmask = affinity;
+	cpumask_t tmask = *affinity;
 	int cpu = 0;
 	void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
 
@@ -139,7 +139,7 @@ void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 	 * be made to forward to an offline "CPU".
 	 */
 
-	for_each_cpu_mask(cpu, affinity) {
+	for_each_cpu(cpu, affinity) {
 		if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
 			cpu_clear(cpu, tmask);
 	}
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c
index a35818ed426..12b465d404d 100644
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
 static void disable_bcm1480_irq(unsigned int irq);
 static void ack_bcm1480_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask);
+static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask)
+static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	int i = 0, old_cpu, cpu, int_on, k;
 	u64 cur_ints;
@@ -117,11 +117,11 @@ static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask)
 	unsigned long flags;
 	unsigned int irq_dirty;
 
-	if (cpus_weight(mask) != 1) {
+	if (cpumask_weight(mask) != 1) {
 		printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
 		return;
 	}
-	i = first_cpu(mask);
+	i = cpumask_first(mask);
 
 	/* Convert logical CPU to physical CPU */
 	cpu = cpu_logical_map(i);
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index a5158483986..808ac2959b8 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
 static void disable_sb1250_irq(unsigned int irq);
 static void ack_sb1250_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, cpumask_t mask);
+static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,16 +103,16 @@ void sb1250_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, cpumask_t mask)
+static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	int i = 0, old_cpu, cpu, int_on;
 	u64 cur_ints;
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
 
-	i = first_cpu(mask);
+	i = cpumask_first(mask);
 
-	if (cpus_weight(mask) > 1) {
+	if (cpumask_weight(mask) > 1) {
 		printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
 		return;
 	}
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 23ef950df00..4cea935e2f9 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -131,12 +131,12 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
 	return 0;
 }
 
-static void cpu_set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
 {
-	if (cpu_check_affinity(irq, &dest))
+	if (cpu_check_affinity(irq, dest))
 		return;
 
-	irq_desc[irq].affinity = dest;
+	irq_desc[irq].affinity = *dest;
 }
 #endif
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ac222d0ab12..23b8b5e36f9 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -237,7 +237,7 @@ void fixup_irqs(cpumask_t map)
 			mask = map;
 		}
 		if (irq_desc[irq].chip->set_affinity)
-			irq_desc[irq].chip->set_affinity(irq, mask);
+			irq_desc[irq].chip->set_affinity(irq, &mask);
 		else if (irq_desc[irq].action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 64d24310ce7..424b335a71c 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -332,7 +332,7 @@ static void xics_eoi_lpar(unsigned int virq)
 	lpar_xirr_info_set((0xff << 24) | irq);
 }
 
-static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 {
 	unsigned int irq;
 	int status;
@@ -358,7 +358,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 	irq_server = get_irq_server(virq, 1);
 	if (irq_server == -1) {
 		char cpulist[128];
-		cpumask_scnprintf(cpulist, sizeof(cpulist), &cpumask);
+		cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask);
 		printk(KERN_WARNING
 			"%s: No online cpus in the mask %s for irq %d\n",
 			__func__, cpulist, virq);
@@ -845,7 +845,7 @@ void xics_migrate_irqs_away(void)
 
 		/* Reset affinity to all cpus */
 		irq_desc[virq].affinity = CPU_MASK_ALL;
-		desc->chip->set_affinity(virq, CPU_MASK_ALL);
+		desc->chip->set_affinity(virq, cpu_all_mask);
 unlock:
 		spin_unlock_irqrestore(&desc->lock, flags);
 	}
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 1890fb085cd..5d7f9f0c93c 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -817,7 +817,7 @@ static void mpic_end_ipi(unsigned int irq)
 
 #endif /* CONFIG_SMP */
 
-void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
+void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	struct mpic *mpic = mpic_from_irq(irq);
 	unsigned int src = mpic_irq_to_hw(irq);
@@ -829,7 +829,7 @@ void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
 	} else {
 		cpumask_t tmp;
 
-		cpus_and(tmp, cpumask, cpu_online_map);
+		cpumask_and(&tmp, cpumask, cpu_online_mask);
 
 		mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
 			       mpic_physmask(cpus_addr(tmp)[0]));
diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 6209c62a426..3cef2af10f4 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
 
 extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
 extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 #endif /* _POWERPC_SYSDEV_MPIC_H */
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 52fc836f464..4aaf18e83c8 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -312,7 +312,8 @@ static void sun4u_irq_enable(unsigned int virt_irq)
 	}
 }
 
-static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4u_set_affinity(unsigned int virt_irq,
+			       const struct cpumask *mask)
 {
 	sun4u_irq_enable(virt_irq);
 }
@@ -362,7 +363,8 @@ static void sun4v_irq_enable(unsigned int virt_irq)
 		       ino, err);
 }
 
-static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4v_set_affinity(unsigned int virt_irq,
+			       const struct cpumask *mask)
 {
 	unsigned int ino = virt_irq_table[virt_irq].dev_ino;
 	unsigned long cpuid = irq_choose_cpu(virt_irq);
@@ -429,7 +431,8 @@ static void sun4v_virq_enable(unsigned int virt_irq)
 		       dev_handle, dev_ino, err);
 }
 
-static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4v_virt_set_affinity(unsigned int virt_irq,
+				    const struct cpumask *mask)
 {
 	unsigned long cpuid, dev_handle, dev_ino;
 	int err;
@@ -788,7 +791,7 @@ void fixup_irqs(void)
 		    !(irq_desc[irq].status & IRQ_PER_CPU)) {
 			if (irq_desc[irq].chip->set_affinity)
 				irq_desc[irq].chip->set_affinity(irq,
-					irq_desc[irq].affinity);
+					&irq_desc[irq].affinity);
 		}
 		spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
 	}
diff --git a/arch/sparc64/kernel/of_device.c b/arch/sparc64/kernel/of_device.c
index 0f616ae3246..df2efb7fc14 100644
--- a/arch/sparc64/kernel/of_device.c
+++ b/arch/sparc64/kernel/of_device.c
@@ -780,7 +780,7 @@ out:
 	if (nid != -1) {
 		cpumask_t numa_mask = node_to_cpumask(nid);
 
-		irq_set_affinity(irq, numa_mask);
+		irq_set_affinity(irq, &numa_mask);
 	}
 
 	return irq;
diff --git a/arch/sparc64/kernel/pci_msi.c b/arch/sparc64/kernel/pci_msi.c
index 2e680f34f72..0d0cd815e83 100644
--- a/arch/sparc64/kernel/pci_msi.c
+++ b/arch/sparc64/kernel/pci_msi.c
@@ -288,7 +288,7 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm,
 	if (nid != -1) {
 		cpumask_t numa_mask = node_to_cpumask(nid);
 
-		irq_set_affinity(irq, numa_mask);
+		irq_set_affinity(irq, &numa_mask);
 	}
 	err = request_irq(irq, sparc64_msiq_interrupt, 0,
 			  "MSIQ",
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 067d8de913f..940f25851e1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -301,7 +301,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
 			hpet_setup_msi_irq(hdev->irq);
 			disable_irq(hdev->irq);
-			irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
 			enable_irq(hdev->irq);
 		}
 		break;
@@ -449,7 +449,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
 		return -1;
 
 	disable_irq(dev->irq);
-	irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+	irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
 	enable_irq(dev->irq);
 
 	printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 9043251210f..1184210e6d0 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -361,7 +361,8 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
 
 static int assign_irq_vector(int irq, cpumask_t mask);
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ioapic_affinity_irq(unsigned int irq,
+				    const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
@@ -369,15 +370,14 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 	cpumask_t tmp;
 	struct irq_desc *desc;
 
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
 	cfg = irq_cfg(irq);
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, *mask))
 		return;
 
-	cpus_and(tmp, cfg->domain, mask);
+	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 	/*
 	 * Only the high 8 bits are valid.
@@ -387,7 +387,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
 	__target_IO_APIC_irq(irq, dest, cfg->vector);
-	desc->affinity = mask;
+	cpumask_copy(&desc->affinity, mask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 #endif /* CONFIG_SMP */
@@ -2189,7 +2189,7 @@ static void ir_irq_migration(struct work_struct *work)
 				continue;
 			}
 
-			desc->chip->set_affinity(irq, desc->pending_mask);
+			desc->chip->set_affinity(irq, &desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
 	}
@@ -2198,18 +2198,19 @@ static void ir_irq_migration(struct work_struct *work)
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+				       const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
-		desc->pending_mask = mask;
+		cpumask_copy(&desc->pending_mask, mask);
 		migrate_irq_remapped_level(irq);
 		return;
 	}
 
-	migrate_ioapic_irq(irq, mask);
+	migrate_ioapic_irq(irq, *mask);
 }
 #endif
 
@@ -3027,7 +3028,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
@@ -3035,15 +3036,14 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	cpumask_t tmp;
 	struct irq_desc *desc;
 
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, *mask))
 		return;
 
 	cfg = irq_cfg(irq);
-	cpus_and(tmp, cfg->domain, mask);
+	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	read_msi_msg(irq, &msg);
@@ -3055,7 +3055,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 
 	write_msi_msg(irq, &msg);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	cpumask_copy(&desc->affinity, mask);
 }
 
 #ifdef CONFIG_INTR_REMAP
@@ -3063,7 +3063,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void ir_set_msi_irq_affinity(unsigned int irq,
+				    const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned int dest;
@@ -3071,18 +3072,17 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	struct irte irte;
 	struct irq_desc *desc;
 
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
 	if (get_irte(irq, &irte))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, *mask))
 		return;
 
 	cfg = irq_cfg(irq);
-	cpus_and(tmp, cfg->domain, mask);
+	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	irte.vector = cfg->vector;
@@ -3106,7 +3106,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	}
 
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	cpumask_copy(&desc->affinity, mask);
 }
 #endif
 #endif /* CONFIG_SMP */
@@ -3308,7 +3308,7 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct msi_msg msg;
@@ -3316,15 +3316,14 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	cpumask_t tmp;
 	struct irq_desc *desc;
 
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, *mask))
 		return;
 
 	cfg = irq_cfg(irq);
-	cpus_and(tmp, cfg->domain, mask);
+	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	dmar_msi_read(irq, &msg);
@@ -3336,7 +3335,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 
 	dmar_msi_write(irq, &msg);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	cpumask_copy(&desc->affinity, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -3369,7 +3368,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct irq_desc *desc;
@@ -3377,15 +3376,14 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	unsigned int dest;
 	cpumask_t tmp;
 
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, *mask))
 		return;
 
 	cfg = irq_cfg(irq);
-	cpus_and(tmp, cfg->domain, mask);
+	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	hpet_msi_read(irq, &msg);
@@ -3397,7 +3395,7 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
 
 	hpet_msi_write(irq, &msg);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	cpumask_copy(&desc->affinity, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -3451,27 +3449,26 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 	write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp;
 	struct irq_desc *desc;
 
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpumask_intersects(mask, cpu_online_mask))
 		return;
 
-	if (assign_irq_vector(irq, mask))
+	if (assign_irq_vector(irq, *mask))
 		return;
 
 	cfg = irq_cfg(irq);
-	cpus_and(tmp, cfg->domain, mask);
+	cpumask_and(&tmp, &cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
 	desc = irq_to_desc(irq);
-	desc->affinity = mask;
+	cpumask_copy(&desc->affinity, mask);
 }
 #endif
 
@@ -3794,10 +3791,10 @@ void __init setup_ioapic_dest(void)
 
 #ifdef CONFIG_INTR_REMAP
 			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq(irq, mask);
+				set_ir_ioapic_affinity_irq(irq, &mask);
 			else
 #endif
-				set_ioapic_affinity_irq(irq, mask);
+				set_ioapic_affinity_irq(irq, &mask);
 		}
 
 	}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index a51382672de..87870a49be4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -251,7 +251,7 @@ void fixup_irqs(cpumask_t map)
 			mask = map;
 		}
 		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, mask);
+			desc->chip->set_affinity(irq, &mask);
 		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 60eb84eb77a..7d37f847544 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -116,7 +116,7 @@ void fixup_irqs(cpumask_t map)
 			desc->chip->mask(irq);
 
 		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, mask);
+			desc->chip->set_affinity(irq, &mask);
 		else if (!(warned++))
 			set_affinity = 0;
 
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 7beffcab274..9dedbbd218c 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -704,16 +704,17 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void iosapic_set_affinity_irq(unsigned int irq,
+				     const struct cpumask *dest)
 {
 	struct vector_info *vi = iosapic_get_vector(irq);
 	u32 d0, d1, dummy_d0;
 	unsigned long flags;
 
-	if (cpu_check_affinity(irq, &dest))
+	if (cpu_check_affinity(irq, dest))
 		return;
 
-	vi->txn_addr = txn_affinity_addr(irq, first_cpu(dest));
+	vi->txn_addr = txn_affinity_addr(irq, cpumask_first(dest));
 
 	spin_lock_irqsave(&iosapic_lock, flags);
 	/* d1 contains the destination CPU, so only want to set that
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 1e3b934a4cf..eba5ec5b020 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -579,7 +579,7 @@ void rebind_evtchn_irq(int evtchn, int irq)
 	spin_unlock(&irq_mapping_update_lock);
 
 	/* new event channels are always bound to cpu 0 */
-	irq_set_affinity(irq, cpumask_of_cpu(0));
+	irq_set_affinity(irq, cpumask_of(0));
 
 	/* Unmask the event channel. */
 	enable_irq(irq);
@@ -608,9 +608,9 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 }
 
 
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
+static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
 {
-	unsigned tcpu = first_cpu(dest);
+	unsigned tcpu = cpumask_first(dest);
 	rebind_irq_to_cpu(irq, tcpu);
 }
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index f58a0cf8929..48e63934fab 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -109,13 +109,13 @@ extern void enable_irq(unsigned int irq);
 
 extern cpumask_t irq_default_affinity;
 
-extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
 #else /* CONFIG_SMP */
 
-static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3dddfa703eb..ab70fd604d3 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -113,7 +113,8 @@ struct irq_chip {
 	void		(*eoi)(unsigned int irq);
 
 	void		(*end)(unsigned int irq);
-	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
+	void		(*set_affinity)(unsigned int irq,
+					const struct cpumask *dest);
 	int		(*retrigger)(unsigned int irq);
 	int		(*set_type)(unsigned int irq, unsigned int flow_type);
 	int		(*set_wake)(unsigned int irq, unsigned int on);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bf..58d8e31daa4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -45,7 +45,7 @@ void dynamic_irq_init(unsigned int irq)
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-	cpus_setall(desc->affinity);
+	cpumask_setall(&desc->affinity);
 #endif
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c4..10ad2f87ed9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq)
  *	@cpumask:	cpumask
  *
  */
-int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
@@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		desc->affinity = cpumask;
+		cpumask_copy(&desc->affinity, cpumask);
 		desc->chip->set_affinity(irq, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
-		desc->pending_mask = cpumask;
+		cpumask_copy(&desc->pending_mask, cpumask);
 	}
 #else
-	desc->affinity = cpumask;
+	cpumask_copy(&desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
  */
 int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 {
-	cpumask_t mask;
-
 	if (!irq_can_set_affinity(irq))
 		return 0;
 
-	cpus_and(mask, cpu_online_map, irq_default_affinity);
-
 	/*
 	 * Preserve an userspace affinity setup, but make sure that
 	 * one of the targets is online.
 	 */
 	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-		if (cpus_intersects(desc->affinity, cpu_online_map))
-			mask = desc->affinity;
+		if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+		    < nr_cpu_ids)
+			goto set_affinity;
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	desc->affinity = mask;
-	desc->chip->set_affinity(irq, mask);
+	cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+set_affinity:
+	desc->chip->set_affinity(irq, &desc->affinity);
 
 	return 0;
 }
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d9581..bd72329e630 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
 void move_masked_irq(int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	cpumask_t tmp;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
 
 	desc->status &= ~IRQ_MOVE_PENDING;
 
-	if (unlikely(cpus_empty(desc->pending_mask)))
+	if (unlikely(cpumask_empty(&desc->pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
 
 	assert_spin_locked(&desc->lock);
 
-	cpus_and(tmp, desc->pending_mask, cpu_online_map);
-
 	/*
 	 * If there was a valid mask to work with, please
 	 * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(!cpus_empty(tmp))) {
-		desc->chip->set_affinity(irq,tmp);
+	if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+		   < nr_cpu_ids)) {
+		cpumask_and(&desc->affinity,
+			    &desc->pending_mask, cpu_online_mask);
+		desc->chip->set_affinity(irq, &desc->affinity);
 	}
-	cpus_clear(desc->pending_mask);
+	cpumask_clear(&desc->pending_mask);
 }
 
 void move_native_irq(int irq)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f293349d49d..8e91c976252 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
 		const char __user *buffer, size_t count, loff_t *pos)
 {
 	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
-	cpumask_t new_value;
+	cpumask_var_t new_value;
 	int err;
 
 	if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
 		return -EIO;
 
-	err = cpumask_parse_user(buffer, count, &new_value);
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
-		return err;
+		goto free_cpumask;
 
-	if (!is_affinity_mask_valid(new_value))
-		return -EINVAL;
+	if (!is_affinity_mask_valid(*new_value)) {
+		err = -EINVAL;
+		goto free_cpumask;
+	}
 
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
-	if (!cpus_intersects(new_value, cpu_online_map))
+	if (!cpumask_intersects(new_value, cpu_online_mask)) {
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		return irq_select_affinity_usr(irq) ? -EINVAL : count;
-
-	irq_set_affinity(irq, new_value);
+		err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+	} else {
+		irq_set_affinity(irq, new_value);
+		err = count;
+	}
 
-	return count;
+free_cpumask:
+	free_cpumask_var(new_value);
+	return err;
 }
 
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43c..ab65d217583 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
  */
 static void tick_setup_device(struct tick_device *td,
 			      struct clock_event_device *newdev, int cpu,
-			      const cpumask_t *cpumask)
+			      const struct cpumask *cpumask)
 {
 	ktime_t next_event;
 	void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
 	 * When the device is not per cpu, pin the interrupt to the
 	 * current cpu:
 	 */
-	if (!cpus_equal(newdev->cpumask, *cpumask))
-		irq_set_affinity(newdev->irq, *cpumask);
+	if (!cpumask_equal(&newdev->cpumask, cpumask))
+		irq_set_affinity(newdev->irq, cpumask);
 
 	/*
 	 * When global broadcasting is active, check if the current
-- 
cgit v1.2.3-70-g09d2


From 320ab2b0b1e08e3805a3e1084a2f0eb1938d5d67 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:26 +1030
Subject: cpumask: convert struct clock_event_device to cpumask pointers.

Impact: change calling convention of existing clock_event APIs

struct clock_event_timer's cpumask field gets changed to take pointer,
as does the ->broadcast function.

Another single-patch change.  For safety, we BUG_ON() in
clockevents_register_device() if it's not set.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/arm/mach-at91/at91rm9200_time.c    | 3 +--
 arch/arm/mach-at91/at91sam926x_time.c   | 2 +-
 arch/arm/mach-davinci/time.c            | 2 +-
 arch/arm/mach-imx/time.c                | 2 +-
 arch/arm/mach-ixp4xx/common.c           | 2 +-
 arch/arm/mach-msm/timer.c               | 2 +-
 arch/arm/mach-ns9xxx/time-ns9360.c      | 2 +-
 arch/arm/mach-omap1/time.c              | 2 +-
 arch/arm/mach-omap1/timer32k.c          | 2 +-
 arch/arm/mach-omap2/timer-gp.c          | 2 +-
 arch/arm/mach-pxa/time.c                | 2 +-
 arch/arm/mach-realview/core.c           | 2 +-
 arch/arm/mach-realview/localtimer.c     | 4 ++--
 arch/arm/mach-sa1100/time.c             | 2 +-
 arch/arm/mach-versatile/core.c          | 2 +-
 arch/arm/plat-mxc/time.c                | 2 +-
 arch/arm/plat-orion/time.c              | 2 +-
 arch/avr32/kernel/time.c                | 2 +-
 arch/blackfin/kernel/time-ts.c          | 2 +-
 arch/m68knommu/platform/coldfire/pit.c  | 2 +-
 arch/mips/jazz/irq.c                    | 2 +-
 arch/mips/kernel/cevt-bcm1480.c         | 2 +-
 arch/mips/kernel/cevt-ds1287.c          | 2 +-
 arch/mips/kernel/cevt-gt641xx.c         | 2 +-
 arch/mips/kernel/cevt-r4k.c             | 2 +-
 arch/mips/kernel/cevt-sb1250.c          | 2 +-
 arch/mips/kernel/cevt-smtc.c            | 2 +-
 arch/mips/kernel/cevt-txx9.c            | 2 +-
 arch/mips/kernel/i8253.c                | 2 +-
 arch/mips/nxp/pnx8550/common/time.c     | 1 +
 arch/mips/sgi-ip27/ip27-timer.c         | 2 +-
 arch/mips/sni/time.c                    | 2 +-
 arch/powerpc/kernel/time.c              | 2 +-
 arch/s390/kernel/time.c                 | 2 +-
 arch/sh/include/asm/smp.h               | 2 +-
 arch/sh/kernel/smp.c                    | 4 ++--
 arch/sh/kernel/timers/timer-broadcast.c | 2 +-
 arch/sh/kernel/timers/timer-tmu.c       | 2 +-
 arch/sparc64/kernel/time.c              | 2 +-
 arch/um/kernel/time.c                   | 2 +-
 arch/x86/kernel/apic.c                  | 8 ++++----
 arch/x86/kernel/hpet.c                  | 4 ++--
 arch/x86/kernel/i8253.c                 | 2 +-
 arch/x86/kernel/mfgpt_32.c              | 2 +-
 arch/x86/kernel/vmiclock_32.c           | 2 +-
 arch/x86/lguest/boot.c                  | 2 +-
 arch/x86/xen/time.c                     | 2 +-
 drivers/clocksource/tcb_clksrc.c        | 2 +-
 include/linux/clockchips.h              | 4 ++--
 kernel/time/clockevents.c               | 2 ++
 kernel/time/tick-broadcast.c            | 2 +-
 kernel/time/tick-common.c               | 8 ++++----
 52 files changed, 63 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-at91/at91rm9200_time.c b/arch/arm/mach-at91/at91rm9200_time.c
index a72e798a2a4..72f51d39202 100644
--- a/arch/arm/mach-at91/at91rm9200_time.c
+++ b/arch/arm/mach-at91/at91rm9200_time.c
@@ -169,7 +169,6 @@ static struct clock_event_device clkevt = {
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.shift		= 32,
 	.rating		= 150,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event	= clkevt32k_next_event,
 	.set_mode	= clkevt32k_mode,
 };
@@ -197,7 +196,7 @@ void __init at91rm9200_timer_init(void)
 	clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift);
 	clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt);
 	clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1;
-	clkevt.cpumask = cpumask_of_cpu(0);
+	clkevt.cpumask = cpumask_of(0);
 	clockevents_register_device(&clkevt);
 
 	/* register clocksource */
diff --git a/arch/arm/mach-at91/at91sam926x_time.c b/arch/arm/mach-at91/at91sam926x_time.c
index 122fd77ed58..b63e1d5f1ba 100644
--- a/arch/arm/mach-at91/at91sam926x_time.c
+++ b/arch/arm/mach-at91/at91sam926x_time.c
@@ -91,7 +91,6 @@ static struct clock_event_device pit_clkevt = {
 	.features	= CLOCK_EVT_FEAT_PERIODIC,
 	.shift		= 32,
 	.rating		= 100,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_mode	= pit_clkevt_mode,
 };
 
@@ -173,6 +172,7 @@ static void __init at91sam926x_pit_init(void)
 
 	/* Set up and register clockevents */
 	pit_clkevt.mult = div_sc(pit_rate, NSEC_PER_SEC, pit_clkevt.shift);
+	pit_clkevt.cpumask = cpumask_of(0);
 	clockevents_register_device(&pit_clkevt);
 }
 
diff --git a/arch/arm/mach-davinci/time.c b/arch/arm/mach-davinci/time.c
index 3b9a296b5c4..f8bcd29d17a 100644
--- a/arch/arm/mach-davinci/time.c
+++ b/arch/arm/mach-davinci/time.c
@@ -322,7 +322,7 @@ static void __init davinci_timer_init(void)
 	clockevent_davinci.min_delta_ns =
 		clockevent_delta2ns(1, &clockevent_davinci);
 
-	clockevent_davinci.cpumask = cpumask_of_cpu(0);
+	clockevent_davinci.cpumask = cpumask_of(0);
 	clockevents_register_device(&clockevent_davinci);
 }
 
diff --git a/arch/arm/mach-imx/time.c b/arch/arm/mach-imx/time.c
index a11765f5f23..aff0ebcfa84 100644
--- a/arch/arm/mach-imx/time.c
+++ b/arch/arm/mach-imx/time.c
@@ -184,7 +184,7 @@ static int __init imx_clockevent_init(unsigned long rate)
 	clockevent_imx.min_delta_ns =
 		clockevent_delta2ns(0xf, &clockevent_imx);
 
-	clockevent_imx.cpumask = cpumask_of_cpu(0);
+	clockevent_imx.cpumask = cpumask_of(0);
 
 	clockevents_register_device(&clockevent_imx);
 
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index 7766f469456..f4656d2ac8a 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -487,7 +487,7 @@ static int __init ixp4xx_clockevent_init(void)
 		clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx);
 	clockevent_ixp4xx.min_delta_ns =
 		clockevent_delta2ns(0xf, &clockevent_ixp4xx);
-	clockevent_ixp4xx.cpumask = cpumask_of_cpu(0);
+	clockevent_ixp4xx.cpumask = cpumask_of(0);
 
 	clockevents_register_device(&clockevent_ixp4xx);
 	return 0;
diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c
index 345a14cb73c..444d9c0f5ca 100644
--- a/arch/arm/mach-msm/timer.c
+++ b/arch/arm/mach-msm/timer.c
@@ -182,7 +182,7 @@ static void __init msm_timer_init(void)
 			clockevent_delta2ns(0xf0000000 >> clock->shift, ce);
 		/* 4 gets rounded down to 3 */
 		ce->min_delta_ns = clockevent_delta2ns(4, ce);
-		ce->cpumask = cpumask_of_cpu(0);
+		ce->cpumask = cpumask_of(0);
 
 		cs->mult = clocksource_hz2mult(clock->freq, cs->shift);
 		res = clocksource_register(cs);
diff --git a/arch/arm/mach-ns9xxx/time-ns9360.c b/arch/arm/mach-ns9xxx/time-ns9360.c
index a63424d083d..41df6972176 100644
--- a/arch/arm/mach-ns9xxx/time-ns9360.c
+++ b/arch/arm/mach-ns9xxx/time-ns9360.c
@@ -173,7 +173,7 @@ static void __init ns9360_timer_init(void)
 	ns9360_clockevent_device.min_delta_ns =
 		clockevent_delta2ns(1, &ns9360_clockevent_device);
 
-	ns9360_clockevent_device.cpumask = cpumask_of_cpu(0);
+	ns9360_clockevent_device.cpumask = cpumask_of(0);
 	clockevents_register_device(&ns9360_clockevent_device);
 
 	setup_irq(IRQ_NS9360_TIMER0 + TIMER_CLOCKEVENT,
diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c
index 2cf7e32bd29..495a32c287b 100644
--- a/arch/arm/mach-omap1/time.c
+++ b/arch/arm/mach-omap1/time.c
@@ -173,7 +173,7 @@ static __init void omap_init_mpu_timer(unsigned long rate)
 	clockevent_mpu_timer1.min_delta_ns =
 		clockevent_delta2ns(1, &clockevent_mpu_timer1);
 
-	clockevent_mpu_timer1.cpumask = cpumask_of_cpu(0);
+	clockevent_mpu_timer1.cpumask = cpumask_of(0);
 	clockevents_register_device(&clockevent_mpu_timer1);
 }
 
diff --git a/arch/arm/mach-omap1/timer32k.c b/arch/arm/mach-omap1/timer32k.c
index 705367ece17..fd3f7396e16 100644
--- a/arch/arm/mach-omap1/timer32k.c
+++ b/arch/arm/mach-omap1/timer32k.c
@@ -187,7 +187,7 @@ static __init void omap_init_32k_timer(void)
 	clockevent_32k_timer.min_delta_ns =
 		clockevent_delta2ns(1, &clockevent_32k_timer);
 
-	clockevent_32k_timer.cpumask = cpumask_of_cpu(0);
+	clockevent_32k_timer.cpumask = cpumask_of(0);
 	clockevents_register_device(&clockevent_32k_timer);
 }
 
diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c
index 589393bedad..ae6036300f6 100644
--- a/arch/arm/mach-omap2/timer-gp.c
+++ b/arch/arm/mach-omap2/timer-gp.c
@@ -120,7 +120,7 @@ static void __init omap2_gp_clockevent_init(void)
 	clockevent_gpt.min_delta_ns =
 		clockevent_delta2ns(1, &clockevent_gpt);
 
-	clockevent_gpt.cpumask = cpumask_of_cpu(0);
+	clockevent_gpt.cpumask = cpumask_of(0);
 	clockevents_register_device(&clockevent_gpt);
 }
 
diff --git a/arch/arm/mach-pxa/time.c b/arch/arm/mach-pxa/time.c
index f8a9a62959e..bf3c9a4aad5 100644
--- a/arch/arm/mach-pxa/time.c
+++ b/arch/arm/mach-pxa/time.c
@@ -122,7 +122,6 @@ static struct clock_event_device ckevt_pxa_osmr0 = {
 	.features	= CLOCK_EVT_FEAT_ONESHOT,
 	.shift		= 32,
 	.rating		= 200,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event	= pxa_osmr0_set_next_event,
 	.set_mode	= pxa_osmr0_set_mode,
 };
@@ -170,6 +169,7 @@ static void __init pxa_timer_init(void)
 		clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0);
 	ckevt_pxa_osmr0.min_delta_ns =
 		clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1;
+	ckevt_pxa_osmr0.cpumask = cpumask_of(0);
 
 	cksrc_pxa_oscr0.mult =
 		clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift);
diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c
index 2f04d54711e..b07cb9b7adb 100644
--- a/arch/arm/mach-realview/core.c
+++ b/arch/arm/mach-realview/core.c
@@ -511,7 +511,7 @@ static struct clock_event_device timer0_clockevent =	 {
 	.set_mode	= timer_set_mode,
 	.set_next_event	= timer_set_next_event,
 	.rating		= 300,
-	.cpumask	= CPU_MASK_ALL,
+	.cpumask	= cpu_all_mask,
 };
 
 static void __init realview_clockevents_init(unsigned int timer_irq)
diff --git a/arch/arm/mach-realview/localtimer.c b/arch/arm/mach-realview/localtimer.c
index 44d178cd573..504961ef343 100644
--- a/arch/arm/mach-realview/localtimer.c
+++ b/arch/arm/mach-realview/localtimer.c
@@ -161,7 +161,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
 	clk->set_mode		= local_timer_set_mode;
 	clk->set_next_event	= local_timer_set_next_event;
 	clk->irq		= IRQ_LOCALTIMER;
-	clk->cpumask		= cpumask_of_cpu(cpu);
+	clk->cpumask		= cpumask_of(cpu);
 	clk->shift		= 20;
 	clk->mult		= div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift);
 	clk->max_delta_ns	= clockevent_delta2ns(0xffffffff, clk);
@@ -199,7 +199,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
 	clk->rating		= 200;
 	clk->set_mode		= dummy_timer_set_mode;
 	clk->broadcast		= smp_timer_broadcast;
-	clk->cpumask		= cpumask_of_cpu(cpu);
+	clk->cpumask		= cpumask_of(cpu);
 
 	clockevents_register_device(clk);
 }
diff --git a/arch/arm/mach-sa1100/time.c b/arch/arm/mach-sa1100/time.c
index 24c0a4bae85..1cac4ac0b4b 100644
--- a/arch/arm/mach-sa1100/time.c
+++ b/arch/arm/mach-sa1100/time.c
@@ -73,7 +73,6 @@ static struct clock_event_device ckevt_sa1100_osmr0 = {
 	.features	= CLOCK_EVT_FEAT_ONESHOT,
 	.shift		= 32,
 	.rating		= 200,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event	= sa1100_osmr0_set_next_event,
 	.set_mode	= sa1100_osmr0_set_mode,
 };
@@ -110,6 +109,7 @@ static void __init sa1100_timer_init(void)
 		clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0);
 	ckevt_sa1100_osmr0.min_delta_ns =
 		clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1;
+	ckevt_sa1100_osmr0.cpumask = cpumask_of(0);
 
 	cksrc_sa1100_oscr.mult =
 		clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift);
diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c
index 565e0ba0d67..a3f1933434e 100644
--- a/arch/arm/mach-versatile/core.c
+++ b/arch/arm/mach-versatile/core.c
@@ -965,7 +965,7 @@ static void __init versatile_timer_init(void)
 	timer0_clockevent.min_delta_ns =
 		clockevent_delta2ns(0xf, &timer0_clockevent);
 
-	timer0_clockevent.cpumask = cpumask_of_cpu(0);
+	timer0_clockevent.cpumask = cpumask_of(0);
 	clockevents_register_device(&timer0_clockevent);
 }
 
diff --git a/arch/arm/plat-mxc/time.c b/arch/arm/plat-mxc/time.c
index fd28f5194f7..758a1293bcf 100644
--- a/arch/arm/plat-mxc/time.c
+++ b/arch/arm/plat-mxc/time.c
@@ -190,7 +190,7 @@ static int __init mxc_clockevent_init(void)
 	clockevent_mxc.min_delta_ns =
 			clockevent_delta2ns(0xff, &clockevent_mxc);
 
-	clockevent_mxc.cpumask = cpumask_of_cpu(0);
+	clockevent_mxc.cpumask = cpumask_of(0);
 
 	clockevents_register_device(&clockevent_mxc);
 
diff --git a/arch/arm/plat-orion/time.c b/arch/arm/plat-orion/time.c
index 544d6b327f3..6fa2923e6dc 100644
--- a/arch/arm/plat-orion/time.c
+++ b/arch/arm/plat-orion/time.c
@@ -149,7 +149,6 @@ static struct clock_event_device orion_clkevt = {
 	.features	= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
 	.shift		= 32,
 	.rating		= 300,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event	= orion_clkevt_next_event,
 	.set_mode	= orion_clkevt_mode,
 };
@@ -199,5 +198,6 @@ void __init orion_time_init(unsigned int irq, unsigned int tclk)
 	orion_clkevt.mult = div_sc(tclk, NSEC_PER_SEC, orion_clkevt.shift);
 	orion_clkevt.max_delta_ns = clockevent_delta2ns(0xfffffffe, &orion_clkevt);
 	orion_clkevt.min_delta_ns = clockevent_delta2ns(1, &orion_clkevt);
+	orion_clkevt.cpumask = cpumask_of(0);
 	clockevents_register_device(&orion_clkevt);
 }
diff --git a/arch/avr32/kernel/time.c b/arch/avr32/kernel/time.c
index 283481d74a5..0ff46bf873b 100644
--- a/arch/avr32/kernel/time.c
+++ b/arch/avr32/kernel/time.c
@@ -106,7 +106,6 @@ static struct clock_event_device comparator = {
 	.features	= CLOCK_EVT_FEAT_ONESHOT,
 	.shift		= 16,
 	.rating		= 50,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event	= comparator_next_event,
 	.set_mode	= comparator_mode,
 };
@@ -134,6 +133,7 @@ void __init time_init(void)
 	comparator.mult = div_sc(counter_hz, NSEC_PER_SEC, comparator.shift);
 	comparator.max_delta_ns = clockevent_delta2ns((u32)~0, &comparator);
 	comparator.min_delta_ns = clockevent_delta2ns(50, &comparator) + 1;
+	comparator.cpumask = cpumask_of(0);
 
 	sysreg_write(COMPARE, 0);
 	timer_irqaction.dev_id = &comparator;
diff --git a/arch/blackfin/kernel/time-ts.c b/arch/blackfin/kernel/time-ts.c
index e887efc86c2..0ed2badfd74 100644
--- a/arch/blackfin/kernel/time-ts.c
+++ b/arch/blackfin/kernel/time-ts.c
@@ -162,7 +162,6 @@ static struct clock_event_device clockevent_bfin = {
 	.name		= "bfin_core_timer",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.shift		= 32,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event = bfin_timer_set_next_event,
 	.set_mode	= bfin_timer_set_mode,
 };
@@ -193,6 +192,7 @@ static int __init bfin_clockevent_init(void)
 	clockevent_bfin.mult = div_sc(timer_clk, NSEC_PER_SEC, clockevent_bfin.shift);
 	clockevent_bfin.max_delta_ns = clockevent_delta2ns(-1, &clockevent_bfin);
 	clockevent_bfin.min_delta_ns = clockevent_delta2ns(100, &clockevent_bfin);
+	clockevent_bfin.cpumask = cpumask_of(0);
 	clockevents_register_device(&clockevent_bfin);
 
 	return 0;
diff --git a/arch/m68knommu/platform/coldfire/pit.c b/arch/m68knommu/platform/coldfire/pit.c
index c5b916700b2..2a12e7fa974 100644
--- a/arch/m68knommu/platform/coldfire/pit.c
+++ b/arch/m68knommu/platform/coldfire/pit.c
@@ -156,7 +156,7 @@ void hw_timer_init(void)
 {
 	u32 imr;
 
-	cf_pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+	cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id());
 	cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32);
 	cf_pit_clockevent.max_delta_ns =
 		clockevent_delta2ns(0xFFFF, &cf_pit_clockevent);
diff --git a/arch/mips/jazz/irq.c b/arch/mips/jazz/irq.c
index d7f8a782aae..03965cb1b25 100644
--- a/arch/mips/jazz/irq.c
+++ b/arch/mips/jazz/irq.c
@@ -146,7 +146,7 @@ void __init plat_time_init(void)
 
 	BUG_ON(HZ != 100);
 
-	cd->cpumask             = cpumask_of_cpu(cpu);
+	cd->cpumask             = cpumask_of(cpu);
 	clockevents_register_device(cd);
 	action->dev_id = cd;
 	setup_irq(JAZZ_TIMER_IRQ, action);
diff --git a/arch/mips/kernel/cevt-bcm1480.c b/arch/mips/kernel/cevt-bcm1480.c
index d7e21bc8cd2..b820661678b 100644
--- a/arch/mips/kernel/cevt-bcm1480.c
+++ b/arch/mips/kernel/cevt-bcm1480.c
@@ -126,7 +126,7 @@ void __cpuinit sb1480_clockevent_init(void)
 	cd->min_delta_ns	= clockevent_delta2ns(2, cd);
 	cd->rating		= 200;
 	cd->irq			= irq;
-	cd->cpumask		= cpumask_of_cpu(cpu);
+	cd->cpumask		= cpumask_of(cpu);
 	cd->set_next_event	= sibyte_next_event;
 	cd->set_mode		= sibyte_set_mode;
 	clockevents_register_device(cd);
diff --git a/arch/mips/kernel/cevt-ds1287.c b/arch/mips/kernel/cevt-ds1287.c
index df4acb68bfb..1ada45ea070 100644
--- a/arch/mips/kernel/cevt-ds1287.c
+++ b/arch/mips/kernel/cevt-ds1287.c
@@ -88,7 +88,6 @@ static void ds1287_event_handler(struct clock_event_device *dev)
 static struct clock_event_device ds1287_clockevent = {
 	.name		= "ds1287",
 	.features	= CLOCK_EVT_FEAT_PERIODIC,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_next_event	= ds1287_set_next_event,
 	.set_mode	= ds1287_set_mode,
 	.event_handler	= ds1287_event_handler,
@@ -122,6 +121,7 @@ int __init ds1287_clockevent_init(int irq)
 	clockevent_set_clock(cd, 32768);
 	cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
 	cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+	cd->cpumask = cpumask_of(0);
 
 	clockevents_register_device(&ds1287_clockevent);
 
diff --git a/arch/mips/kernel/cevt-gt641xx.c b/arch/mips/kernel/cevt-gt641xx.c
index 6e2f58520af..e9b787feedc 100644
--- a/arch/mips/kernel/cevt-gt641xx.c
+++ b/arch/mips/kernel/cevt-gt641xx.c
@@ -96,7 +96,6 @@ static void gt641xx_timer0_event_handler(struct clock_event_device *dev)
 static struct clock_event_device gt641xx_timer0_clockevent = {
 	.name		= "gt641xx-timer0",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.cpumask	= CPU_MASK_CPU0,
 	.irq		= GT641XX_TIMER0_IRQ,
 	.set_next_event	= gt641xx_timer0_set_next_event,
 	.set_mode	= gt641xx_timer0_set_mode,
@@ -132,6 +131,7 @@ static int __init gt641xx_timer0_clockevent_init(void)
 	clockevent_set_clock(cd, gt641xx_base_clock);
 	cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
 	cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+	cd->cpumask = cpumask_of(0);
 
 	clockevents_register_device(&gt641xx_timer0_clockevent);
 
diff --git a/arch/mips/kernel/cevt-r4k.c b/arch/mips/kernel/cevt-r4k.c
index 4a4c59f2737..e1ec83b6803 100644
--- a/arch/mips/kernel/cevt-r4k.c
+++ b/arch/mips/kernel/cevt-r4k.c
@@ -195,7 +195,7 @@ int __cpuinit mips_clockevent_init(void)
 
 	cd->rating		= 300;
 	cd->irq			= irq;
-	cd->cpumask		= cpumask_of_cpu(cpu);
+	cd->cpumask		= cpumask_of(cpu);
 	cd->set_next_event	= mips_next_event;
 	cd->set_mode		= mips_set_clock_mode;
 	cd->event_handler	= mips_event_handler;
diff --git a/arch/mips/kernel/cevt-sb1250.c b/arch/mips/kernel/cevt-sb1250.c
index 0f188cd46e0..a2eebaafda5 100644
--- a/arch/mips/kernel/cevt-sb1250.c
+++ b/arch/mips/kernel/cevt-sb1250.c
@@ -125,7 +125,7 @@ void __cpuinit sb1250_clockevent_init(void)
 	cd->min_delta_ns	= clockevent_delta2ns(2, cd);
 	cd->rating		= 200;
 	cd->irq			= irq;
-	cd->cpumask		= cpumask_of_cpu(cpu);
+	cd->cpumask		= cpumask_of(cpu);
 	cd->set_next_event	= sibyte_next_event;
 	cd->set_mode		= sibyte_set_mode;
 	clockevents_register_device(cd);
diff --git a/arch/mips/kernel/cevt-smtc.c b/arch/mips/kernel/cevt-smtc.c
index 5162fe4b595..6d45e24db5b 100644
--- a/arch/mips/kernel/cevt-smtc.c
+++ b/arch/mips/kernel/cevt-smtc.c
@@ -292,7 +292,7 @@ int __cpuinit mips_clockevent_init(void)
 
 	cd->rating		= 300;
 	cd->irq			= irq;
-	cd->cpumask		= cpumask_of_cpu(cpu);
+	cd->cpumask		= cpumask_of(cpu);
 	cd->set_next_event	= mips_next_event;
 	cd->set_mode		= mips_set_clock_mode;
 	cd->event_handler	= mips_event_handler;
diff --git a/arch/mips/kernel/cevt-txx9.c b/arch/mips/kernel/cevt-txx9.c
index b5fc4eb412d..eccf7d6096b 100644
--- a/arch/mips/kernel/cevt-txx9.c
+++ b/arch/mips/kernel/cevt-txx9.c
@@ -112,7 +112,6 @@ static struct clock_event_device txx9tmr_clock_event_device = {
 	.name		= "TXx9",
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.rating		= 200,
-	.cpumask	= CPU_MASK_CPU0,
 	.set_mode	= txx9tmr_set_mode,
 	.set_next_event	= txx9tmr_set_next_event,
 };
@@ -150,6 +149,7 @@ void __init txx9_clockevent_init(unsigned long baseaddr, int irq,
 		clockevent_delta2ns(0xffffffff >> (32 - TXX9_TIMER_BITS), cd);
 	cd->min_delta_ns = clockevent_delta2ns(0xf, cd);
 	cd->irq = irq;
+	cd->cpumask = cpumask_of(0),
 	clockevents_register_device(cd);
 	setup_irq(irq, &txx9tmr_irq);
 	printk(KERN_INFO "TXx9: clockevent device at 0x%lx, irq %d\n",
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index b6ac55162b9..f4d187825f9 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -115,7 +115,7 @@ void __init setup_pit_timer(void)
 	 * Start pit with the boot cpu mask and make it global after the
 	 * IO_APIC has been initialized.
 	 */
-	cd->cpumask = cpumask_of_cpu(cpu);
+	cd->cpumask = cpumask_of(cpu);
 	clockevent_set_clock(cd, CLOCK_TICK_RATE);
 	cd->max_delta_ns = clockevent_delta2ns(0x7FFF, cd);
 	cd->min_delta_ns = clockevent_delta2ns(0xF, cd);
diff --git a/arch/mips/nxp/pnx8550/common/time.c b/arch/mips/nxp/pnx8550/common/time.c
index 62f495b57f9..cf293b27909 100644
--- a/arch/mips/nxp/pnx8550/common/time.c
+++ b/arch/mips/nxp/pnx8550/common/time.c
@@ -102,6 +102,7 @@ __init void plat_time_init(void)
 	unsigned int p;
 	unsigned int pow2p;
 
+	pnx8xxx_clockevent.cpumask = cpu_none_mask;
 	clockevents_register_device(&pnx8xxx_clockevent);
 	clocksource_register(&pnx_clocksource);
 
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index 1327c2746fb..f024057a35f 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -134,7 +134,7 @@ void __cpuinit hub_rt_clock_event_init(void)
 	cd->min_delta_ns        = clockevent_delta2ns(0x300, cd);
 	cd->rating		= 200;
 	cd->irq			= irq;
-	cd->cpumask		= cpumask_of_cpu(cpu);
+	cd->cpumask		= cpumask_of(cpu);
 	cd->set_next_event	= rt_next_event;
 	cd->set_mode		= rt_set_mode;
 	clockevents_register_device(cd);
diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c
index 796e3ce2872..69f5f88711c 100644
--- a/arch/mips/sni/time.c
+++ b/arch/mips/sni/time.c
@@ -80,7 +80,7 @@ static void __init sni_a20r_timer_setup(void)
 	struct irqaction *action = &a20r_irqaction;
 	unsigned int cpu = smp_processor_id();
 
-	cd->cpumask             = cpumask_of_cpu(cpu);
+	cd->cpumask             = cpumask_of(cpu);
 	clockevents_register_device(cd);
 	action->dev_id = cd;
 	setup_irq(SNI_A20R_IRQ_TIMER, &a20r_irqaction);
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e2ee66b5831..6f39d35d6f5 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -869,7 +869,7 @@ static void register_decrementer_clockevent(int cpu)
 	struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
 
 	*dec = decrementer_clockevent;
-	dec->cpumask = cpumask_of_cpu(cpu);
+	dec->cpumask = cpumask_of(cpu);
 
 	printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
 	       dec->name, dec->mult, dec->shift, cpu);
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index eccefbbff88..f5bd141c844 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -154,7 +154,7 @@ void init_cpu_timer(void)
 	cd->min_delta_ns	= 1;
 	cd->max_delta_ns	= LONG_MAX;
 	cd->rating		= 400;
-	cd->cpumask		= cpumask_of_cpu(cpu);
+	cd->cpumask		= cpumask_of(cpu);
 	cd->set_next_event	= s390_next_event;
 	cd->set_mode		= s390_set_mode;
 
diff --git a/arch/sh/include/asm/smp.h b/arch/sh/include/asm/smp.h
index 85b660c17eb..c24e9c6a173 100644
--- a/arch/sh/include/asm/smp.h
+++ b/arch/sh/include/asm/smp.h
@@ -31,7 +31,7 @@ enum {
 };
 
 void smp_message_recv(unsigned int msg);
-void smp_timer_broadcast(cpumask_t mask);
+void smp_timer_broadcast(const struct cpumask *mask);
 
 void local_timer_interrupt(void);
 void local_timer_setup(unsigned int cpu);
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 593937d0c49..8f402741261 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -184,11 +184,11 @@ void arch_send_call_function_single_ipi(int cpu)
 	plat_send_ipi(cpu, SMP_MSG_FUNCTION_SINGLE);
 }
 
-void smp_timer_broadcast(cpumask_t mask)
+void smp_timer_broadcast(const struct cpumask *mask)
 {
 	int cpu;
 
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu(cpu, mask)
 		plat_send_ipi(cpu, SMP_MSG_TIMER);
 }
 
diff --git a/arch/sh/kernel/timers/timer-broadcast.c b/arch/sh/kernel/timers/timer-broadcast.c
index c2317635230..96e8eaea1e6 100644
--- a/arch/sh/kernel/timers/timer-broadcast.c
+++ b/arch/sh/kernel/timers/timer-broadcast.c
@@ -51,7 +51,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
 	clk->mult		= 1;
 	clk->set_mode		= dummy_timer_set_mode;
 	clk->broadcast		= smp_timer_broadcast;
-	clk->cpumask		= cpumask_of_cpu(cpu);
+	clk->cpumask		= cpumask_of(cpu);
 
 	clockevents_register_device(clk);
 }
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index 3c61ddd4d43..0db3f951033 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -263,7 +263,7 @@ static int tmu_timer_init(void)
 	tmu0_clockevent.min_delta_ns =
 			clockevent_delta2ns(1, &tmu0_clockevent);
 
-	tmu0_clockevent.cpumask = cpumask_of_cpu(0);
+	tmu0_clockevent.cpumask = cpumask_of(0);
 
 	clockevents_register_device(&tmu0_clockevent);
 
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index 141da375909..9df8f095a8b 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -763,7 +763,7 @@ void __devinit setup_sparc64_timer(void)
 	sevt = &__get_cpu_var(sparc64_events);
 
 	memcpy(sevt, &sparc64_clockevent, sizeof(*sevt));
-	sevt->cpumask = cpumask_of_cpu(smp_processor_id());
+	sevt->cpumask = cpumask_of(smp_processor_id());
 
 	clockevents_register_device(sevt);
 }
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 47f04f4a346..b13a87a3ec9 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -50,7 +50,7 @@ static int itimer_next_event(unsigned long delta,
 static struct clock_event_device itimer_clockevent = {
 	.name		= "itimer",
 	.rating		= 250,
-	.cpumask	= CPU_MASK_ALL,
+	.cpumask	= cpu_all_mask,
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.set_mode	= itimer_set_mode,
 	.set_next_event = itimer_next_event,
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b52..b2cef49f308 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -141,7 +141,7 @@ static int lapic_next_event(unsigned long delta,
 			    struct clock_event_device *evt);
 static void lapic_timer_setup(enum clock_event_mode mode,
 			      struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
+static void lapic_timer_broadcast(const struct cpumask *mask);
 static void apic_pm_activate(void);
 
 /*
@@ -453,10 +453,10 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 /*
  * Local APIC timer broadcast function
  */
-static void lapic_timer_broadcast(cpumask_t mask)
+static void lapic_timer_broadcast(const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
-	send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+	send_IPI_mask(*mask, LOCAL_TIMER_VECTOR);
 #endif
 }
 
@@ -469,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void)
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
 	memcpy(levt, &lapic_clockevent, sizeof(*levt));
-	levt->cpumask = cpumask_of_cpu(smp_processor_id());
+	levt->cpumask = cpumask_of(smp_processor_id());
 
 	clockevents_register_device(levt);
 }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 940f25851e1..e76d7e27297 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -246,7 +246,7 @@ static void hpet_legacy_clockevent_register(void)
 	 * Start hpet with the boot cpu mask and make it
 	 * global after the IO_APIC has been initialized.
 	 */
-	hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+	hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
 	clockevents_register_device(&hpet_clockevent);
 	global_clock_event = &hpet_clockevent;
 	printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -500,7 +500,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 	/* 5 usec minimum reprogramming delta. */
 	evt->min_delta_ns = 5000;
 
-	evt->cpumask = cpumask_of_cpu(hdev->cpu);
+	evt->cpumask = cpumask_of(hdev->cpu);
 	clockevents_register_device(evt);
 }
 
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index c1b5e3ece1f..10f92fb532f 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
 	 * Start pit with the boot cpu mask and make it global after the
 	 * IO_APIC has been initialized.
 	 */
-	pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+	pit_clockevent.cpumask = cpumask_of(smp_processor_id());
 	pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
 				     pit_clockevent.shift);
 	pit_clockevent.max_delta_ns =
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3b599518c32..c12314c9e86 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
 	.set_mode = mfgpt_set_mode,
 	.set_next_event = mfgpt_next_event,
 	.rating = 250,
-	.cpumask = CPU_MASK_ALL,
+	.cpumask = cpu_all_mask,
 	.shift = 32
 };
 
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 254ee07f863..c4c1f9e0940 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
 	/* Upper bound is clockevent's use of ulong for cycle deltas. */
 	evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
 	evt->min_delta_ns = clockevent_delta2ns(1, evt);
-	evt->cpumask = cpumask_of_cpu(cpu);
+	evt->cpumask = cpumask_of(cpu);
 
 	printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
 	       evt->name, evt->mult, evt->shift);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a5d8e1ace1c..104c8220a38 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -737,7 +737,7 @@ static void lguest_time_init(void)
 
 	/* We can't set cpumask in the initializer: damn C limitations!  Set it
 	 * here and register our timer device. */
-	lguest_clockevent.cpumask = cpumask_of_cpu(0);
+	lguest_clockevent.cpumask = cpumask_of(0);
 	clockevents_register_device(&lguest_clockevent);
 
 	/* Finally, we unblock the timer interrupt. */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c9f7cda48ed..65d75a6be0b 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -437,7 +437,7 @@ void xen_setup_timer(int cpu)
 	evt = &per_cpu(xen_clock_events, cpu);
 	memcpy(evt, xen_clockevent, sizeof(*evt));
 
-	evt->cpumask = cpumask_of_cpu(cpu);
+	evt->cpumask = cpumask_of(cpu);
 	evt->irq = irq;
 
 	setup_runstate_info(cpu);
diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
index f450588e585..254f1064d97 100644
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -154,7 +154,6 @@ static struct tc_clkevt_device clkevt = {
 		.shift		= 32,
 		/* Should be lower than at91rm9200's system timer */
 		.rating		= 125,
-		.cpumask	= CPU_MASK_CPU0,
 		.set_next_event	= tc_next_event,
 		.set_mode	= tc_mode,
 	},
@@ -195,6 +194,7 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
 	clkevt.clkevt.max_delta_ns
 		= clockevent_delta2ns(0xffff, &clkevt.clkevt);
 	clkevt.clkevt.min_delta_ns = clockevent_delta2ns(1, &clkevt.clkevt) + 1;
+	clkevt.clkevt.cpumask = cpumask_of(0);
 
 	setup_irq(irq, &tc_irqaction);
 
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index ed3a5d473e5..cea153697ec 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -82,13 +82,13 @@ struct clock_event_device {
 	int			shift;
 	int			rating;
 	int			irq;
-	cpumask_t		cpumask;
+	const struct cpumask	*cpumask;
 	int			(*set_next_event)(unsigned long evt,
 						  struct clock_event_device *);
 	void			(*set_mode)(enum clock_event_mode mode,
 					    struct clock_event_device *);
 	void			(*event_handler)(struct clock_event_device *);
-	void			(*broadcast)(cpumask_t mask);
+	void			(*broadcast)(const struct cpumask *mask);
 	struct list_head	list;
 	enum clock_event_mode	mode;
 	ktime_t			next_event;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f8d968063ce..ea2f48af83c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
 void clockevents_register_device(struct clock_event_device *dev)
 {
 	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+	BUG_ON(!dev->cpumask);
+
 	/*
 	 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
 	 * on it, so fix it up and emit a warning:
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f98a1b7b16e..9590af2327b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask)
 		 */
 		cpu = first_cpu(mask);
 		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->broadcast(mask);
+		td->evtdev->broadcast(&mask);
 	}
 }
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ab65d217583..f8372be7412 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -171,7 +171,7 @@ static void tick_setup_device(struct tick_device *td,
 	 * When the device is not per cpu, pin the interrupt to the
 	 * current cpu:
 	 */
-	if (!cpumask_equal(&newdev->cpumask, cpumask))
+	if (!cpumask_equal(newdev->cpumask, cpumask))
 		irq_set_affinity(newdev->irq, cpumask);
 
 	/*
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	spin_lock_irqsave(&tick_device_lock, flags);
 
 	cpu = smp_processor_id();
-	if (!cpu_isset(cpu, newdev->cpumask))
+	if (!cpumask_test_cpu(cpu, newdev->cpumask))
 		goto out_bc;
 
 	td = &per_cpu(tick_cpu_device, cpu);
 	curdev = td->evtdev;
 
 	/* cpu local device ? */
-	if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
+	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
 
 		/*
 		 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		 * If we have a cpu local device already, do not replace it
 		 * by a non cpu local device
 		 */
-		if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
+		if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
 			goto out_bc;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 7be7585393d311866653564fbcd10a3232773c0b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 13 Dec 2008 21:20:28 +1030
Subject: cpumask: Use all NR_CPUS bits unless CONFIG_CPUMASK_OFFSTACK

Impact: futureproof as we convert more code to new APIs

The old cpumask operators treat all NR_CPUS bits as relevent, the new
ones use nr_cpumask_bits.  For large NR_CPUS and small nr_cpu_ids, this
makes a difference.

However, mixing the two can cause problems with undefined bits.  An
arch which sets CONFIG_CPUMASK_OFFSTACK should have converted across
to the new operators, so it's safe in that case.

(Thanks to Stephen Rothwell for bisecting the initial unused-bits bug,
and Mike Travis for this solution).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Mike Travis <travis@sgi.com>
---
 include/linux/cpumask.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 94a2ab88ae8..d4bf52603e6 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -510,9 +510,6 @@ extern cpumask_t cpu_active_map;
 	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD	\
 }
 
-/* This produces more efficient code. */
-#define nr_cpumask_bits	NR_CPUS
-
 #else /* NR_CPUS > BITS_PER_LONG */
 
 #define CPU_BITS_ALL						\
@@ -520,9 +517,15 @@ extern cpumask_t cpu_active_map;
 	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,		\
 	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD		\
 }
+#endif /* NR_CPUS > BITS_PER_LONG */
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
+ * not all bits may be allocated. */
 #define nr_cpumask_bits	nr_cpu_ids
-#endif /* NR_CPUS > BITS_PER_LONG */
+#else
+#define nr_cpumask_bits	NR_CPUS
+#endif
 
 /* verify cpu argument to cpumask_* operators */
 static inline unsigned int cpumask_check(unsigned int cpu)
-- 
cgit v1.2.3-70-g09d2


From b690ace50be7d10d77cb7a6d5ef1bd9de649852f Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben-linux@fluff.org>
Date: Tue, 21 Oct 2008 14:07:03 +0100
Subject: [ARM] S3C6400: serial support for S3C6400 and S3C6410 SoCs

Add support to the Samsung serial driver for the S3C6400
and S3C6410 serial ports.

Signed-off-by: Ben Dooks <ben-linux@fluff.org>
---
 arch/arm/mach-s3c2410/include/mach/map.h     |   2 +
 arch/arm/mach-s3c24a0/include/mach/map.h     |   2 +
 arch/arm/plat-s3c/include/plat/regs-serial.h |   6 ++
 drivers/serial/Kconfig                       |  10 +-
 drivers/serial/Makefile                      |   1 +
 drivers/serial/s3c6400.c                     | 151 +++++++++++++++++++++++++++
 drivers/serial/samsung.c                     |   8 +-
 include/linux/serial_core.h                  |   2 +
 8 files changed, 178 insertions(+), 4 deletions(-)
 create mode 100644 drivers/serial/s3c6400.c

(limited to 'include/linux')

diff --git a/arch/arm/mach-s3c2410/include/mach/map.h b/arch/arm/mach-s3c2410/include/mach/map.h
index 6b30361a080..918e3463297 100644
--- a/arch/arm/mach-s3c2410/include/mach/map.h
+++ b/arch/arm/mach-s3c2410/include/mach/map.h
@@ -101,4 +101,6 @@
 #define S3C24XX_PA_SDI      S3C2410_PA_SDI
 #define S3C24XX_PA_NAND	    S3C2410_PA_NAND
 
+#define S3C_PA_UART	    S3C24XX_PA_UART
+
 #endif /* __ASM_ARCH_MAP_H */
diff --git a/arch/arm/mach-s3c24a0/include/mach/map.h b/arch/arm/mach-s3c24a0/include/mach/map.h
index 2ce1839de4e..6667355a47a 100644
--- a/arch/arm/mach-s3c24a0/include/mach/map.h
+++ b/arch/arm/mach-s3c24a0/include/mach/map.h
@@ -80,4 +80,6 @@
 #define S3C24XX_PA_SDI		S3C24A0_PA_SDI
 #define S3C24XX_PA_NAND		S3C24A0_PA_NAND
 
+#define S3C_PA_UART		S3C24A0_PA_UART
+
 #endif /* __ASM_ARCH_24A0_MAP_H */
diff --git a/arch/arm/plat-s3c/include/plat/regs-serial.h b/arch/arm/plat-s3c/include/plat/regs-serial.h
index 18ba31c7174..3ca28585cf8 100644
--- a/arch/arm/plat-s3c/include/plat/regs-serial.h
+++ b/arch/arm/plat-s3c/include/plat/regs-serial.h
@@ -77,6 +77,12 @@
 #define S3C2440_UCON_FCLK	  (3<<10)
 #define S3C2443_UCON_EPLL	  (3<<10)
 
+#define S3C6400_UCON_CLKMASK	(3<<10)
+#define S3C6400_UCON_PCLK	(0<<10)
+#define S3C6400_UCON_PCLK2	(2<<10)
+#define S3C6400_UCON_UCLK0	(1<<10)
+#define S3C6400_UCON_UCLK1	(3<<10)
+
 #define S3C2440_UCON2_FCLK_EN	  (1<<15)
 #define S3C2440_UCON0_DIVMASK	  (15 << 12)
 #define S3C2440_UCON1_DIVMASK	  (15 << 12)
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index f71a2e8a5f6..e4ae499e587 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -447,7 +447,7 @@ config SERIAL_CLPS711X_CONSOLE
 
 config SERIAL_SAMSUNG
 	tristate "Samsung SoC serial support"
-	depends on ARM && PLAT_S3C24XX
+	depends on ARM && PLAT_S3C
 	select SERIAL_CORE
 	help
 	  Support for the on-chip UARTs on the Samsung S3C24XX series CPUs,
@@ -515,6 +515,14 @@ config SERIAL_S3C24A0
 	help
 	  Serial port support for the Samsung S3C24A0 SoC
 
+config SERIAL_S3C6400
+	tristate "Samsung S3C6400/S3C6410 Serial port support"
+	depends on SERIAL_SAMSUNG && (CPU_S3C600 || CPU_S3C6410)
+	default y
+	help
+	  Serial port support for the Samsung S3C6400 and S3C6410
+	  SoCs
+
 config SERIAL_DZ
 	bool "DECstation DZ serial driver"
 	depends on MACH_DECSTATION && 32BIT
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 7769aece54c..dfe775ac45b 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_SERIAL_S3C2410) += s3c2410.o
 obj-$(CONFIG_SERIAL_S3C2412) += s3c2412.o
 obj-$(CONFIG_SERIAL_S3C2440) += s3c2440.o
 obj-$(CONFIG_SERIAL_S3C24A0) += s3c24a0.o
+obj-$(CONFIG_SERIAL_S3C6400) += s3c6400.o
 obj-$(CONFIG_SERIAL_IP22_ZILOG) += ip22zilog.o
 obj-$(CONFIG_SERIAL_MUX) += mux.o
 obj-$(CONFIG_SERIAL_68328) += 68328serial.o
diff --git a/drivers/serial/s3c6400.c b/drivers/serial/s3c6400.c
new file mode 100644
index 00000000000..06936d13393
--- /dev/null
+++ b/drivers/serial/s3c6400.c
@@ -0,0 +1,151 @@
+/* linux/drivers/serial/s3c6400.c
+ *
+ * Driver for Samsung S3C6400 and S3C6410 SoC onboard UARTs.
+ *
+ * Copyright 2008 Openmoko,  Inc.
+ * Copyright 2008 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *	http://armlinux.simtec.co.uk/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+*/
+
+#include <linux/module.h>
+#include <linux/ioport.h>
+#include <linux/io.h>
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/serial_core.h>
+#include <linux/serial.h>
+
+#include <asm/irq.h>
+#include <mach/hardware.h>
+
+#include <plat/regs-serial.h>
+
+#include "samsung.h"
+
+static int s3c6400_serial_setsource(struct uart_port *port,
+				    struct s3c24xx_uart_clksrc *clk)
+{
+	unsigned long ucon = rd_regl(port, S3C2410_UCON);
+
+	if (strcmp(clk->name, "uclk0") == 0) {
+		ucon &= ~S3C6400_UCON_CLKMASK;
+		ucon |= S3C6400_UCON_UCLK0;
+	} else if (strcmp(clk->name, "uclk1") == 0)
+		ucon |= S3C6400_UCON_UCLK1;
+	else if (strcmp(clk->name, "pclk") == 0) {
+		/* See notes about transitioning from UCLK to PCLK */
+		ucon &= ~S3C6400_UCON_UCLK0;
+	} else {
+		printk(KERN_ERR "unknown clock source %s\n", clk->name);
+		return -EINVAL;
+	}
+
+	wr_regl(port, S3C2410_UCON, ucon);
+	return 0;
+}
+
+
+static int s3c6400_serial_getsource(struct uart_port *port,
+				    struct s3c24xx_uart_clksrc *clk)
+{
+	u32 ucon = rd_regl(port, S3C2410_UCON);
+
+	clk->divisor = 1;
+
+	switch (ucon & S3C6400_UCON_CLKMASK) {
+	case S3C6400_UCON_UCLK0:
+		clk->name = "uclk0";
+		break;
+
+	case S3C6400_UCON_UCLK1:
+		clk->name = "uclk1";
+		break;
+
+	case S3C6400_UCON_PCLK:
+	case S3C6400_UCON_PCLK2:
+		clk->name = "pclk";
+		break;
+	}
+
+	return 0;
+}
+
+static int s3c6400_serial_resetport(struct uart_port *port,
+				    struct s3c2410_uartcfg *cfg)
+{
+	unsigned long ucon = rd_regl(port, S3C2410_UCON);
+
+	dbg("s3c6400_serial_resetport: port=%p (%08lx), cfg=%p\n",
+	    port, port->mapbase, cfg);
+
+	/* ensure we don't change the clock settings... */
+
+	ucon &= S3C6400_UCON_CLKMASK;
+
+	wr_regl(port, S3C2410_UCON,  ucon | cfg->ucon);
+	wr_regl(port, S3C2410_ULCON, cfg->ulcon);
+
+	/* reset both fifos */
+
+	wr_regl(port, S3C2410_UFCON, cfg->ufcon | S3C2410_UFCON_RESETBOTH);
+	wr_regl(port, S3C2410_UFCON, cfg->ufcon);
+
+	return 0;
+}
+
+static struct s3c24xx_uart_info s3c6400_uart_inf = {
+	.name		= "Samsung S3C6400 UART",
+	.type		= PORT_S3C6400,
+	.fifosize	= 64,
+	.rx_fifomask	= S3C2440_UFSTAT_RXMASK,
+	.rx_fifoshift	= S3C2440_UFSTAT_RXSHIFT,
+	.rx_fifofull	= S3C2440_UFSTAT_RXFULL,
+	.tx_fifofull	= S3C2440_UFSTAT_TXFULL,
+	.tx_fifomask	= S3C2440_UFSTAT_TXMASK,
+	.tx_fifoshift	= S3C2440_UFSTAT_TXSHIFT,
+	.get_clksrc	= s3c6400_serial_getsource,
+	.set_clksrc	= s3c6400_serial_setsource,
+	.reset_port	= s3c6400_serial_resetport,
+};
+
+/* device management */
+
+static int s3c6400_serial_probe(struct platform_device *dev)
+{
+	dbg("s3c6400_serial_probe: dev=%p\n", dev);
+	return s3c24xx_serial_probe(dev, &s3c6400_uart_inf);
+}
+
+static struct platform_driver s3c6400_serial_drv = {
+	.probe		= s3c6400_serial_probe,
+	.remove		= s3c24xx_serial_remove,
+	.driver		= {
+		.name	= "s3c6400-uart",
+		.owner	= THIS_MODULE,
+	},
+};
+
+s3c24xx_console_init(&s3c6400_serial_drv, &s3c6400_uart_inf);
+
+static int __init s3c6400_serial_init(void)
+{
+	return s3c24xx_serial_init(&s3c6400_serial_drv, &s3c6400_uart_inf);
+}
+
+static void __exit s3c6400_serial_exit(void)
+{
+	platform_driver_unregister(&s3c6400_serial_drv);
+}
+
+module_init(s3c6400_serial_init);
+module_exit(s3c6400_serial_exit);
+
+MODULE_DESCRIPTION("Samsung S3C6400,S3C6410 SoC Serial port driver");
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:s3c6400-uart");
diff --git a/drivers/serial/samsung.c b/drivers/serial/samsung.c
index bb8b57aae3a..44fc38afa22 100644
--- a/drivers/serial/samsung.c
+++ b/drivers/serial/samsung.c
@@ -47,9 +47,9 @@
 #include <asm/irq.h>
 
 #include <mach/hardware.h>
+#include <mach/map.h>
 
 #include <plat/regs-serial.h>
-#include <mach/regs-gpio.h>
 
 #include "samsung.h"
 
@@ -756,6 +756,8 @@ static const char *s3c24xx_serial_type(struct uart_port *port)
 		return "S3C2440";
 	case PORT_S3C2412:
 		return "S3C2412";
+	case PORT_S3C6400:
+		return "S3C6400/10";
 	default:
 		return NULL;
 	}
@@ -1034,8 +1036,8 @@ static int s3c24xx_serial_init_port(struct s3c24xx_uart_port *ourport,
 
 	dbg("resource %p (%lx..%lx)\n", res, res->start, res->end);
 
-	port->mapbase	= res->start;
-	port->membase	= S3C24XX_VA_UART + (res->start - S3C24XX_PA_UART);
+	port->mapbase = res->start;
+	port->membase = S3C_VA_UART + res->start - (S3C_PA_UART & 0xfff00000);
 	ret = platform_get_irq(platdev, 0);
 	if (ret < 0)
 		port->irq = 0;
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 4e4f1277f3b..feb3b939ec4 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -158,6 +158,8 @@
 /* SH-SCI */
 #define PORT_SCIFA	83
 
+#define PORT_S3C6400	84
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
cgit v1.2.3-70-g09d2


From d2ff911882b6bc693d86ca9566daac70aacbb2b3 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 15 Dec 2008 19:04:35 +1030
Subject: Define smp_call_function_many for UP

Otherwise those using it in transition patches (eg. kvm) can't compile
with CONFIG_SMP=n:

arch/x86/kvm/../../../virt/kvm/kvm_main.c: In function 'make_all_cpus_request':
arch/x86/kvm/../../../virt/kvm/kvm_main.c:380: error: implicit declaration of function 'smp_call_function_many'

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 3f9a60043a9..6e7ba16ff45 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -146,6 +146,8 @@ static inline void smp_send_reschedule(int cpu) { }
 })
 #define smp_call_function_mask(mask, func, info, wait) \
 			(up_smp_call_function(func, info))
+#define smp_call_function_many(mask, func, info, wait) \
+			(up_smp_call_function(func, info))
 static inline void init_call_single_data(void)
 {
 }
-- 
cgit v1.2.3-70-g09d2


From b53c7583e26746ef6f66c866841e10450150ed8e Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 4 Dec 2008 10:01:52 -0800
Subject: rapidio: struct device - replace bus_id with dev_name(),
 dev_set_name()

Cc: Matt Porter <mporter@kernel.crashing.org>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/rapidio/rio-scan.c | 8 ++++----
 include/linux/rio_drv.h    | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 643a6b98462..5c13f61bfb1 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -365,15 +365,15 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 				rdid++)
 			rswitch->route_table[rdid] = RIO_INVALID_ROUTE;
 		rdev->rswitch = rswitch;
-		sprintf(rio_name(rdev), "%02x:s:%04x", rdev->net->id,
-			rdev->rswitch->switchid);
+		dev_set_name(&rdev->dev, "%02x:s:%04x", rdev->net->id,
+			     rdev->rswitch->switchid);
 		rio_route_set_ops(rdev);
 
 		list_add_tail(&rswitch->node, &rio_switches);
 
 	} else
-		sprintf(rio_name(rdev), "%02x:e:%04x", rdev->net->id,
-			rdev->destid);
+		dev_set_name(&rdev->dev, "%02x:e:%04x", rdev->net->id,
+			     rdev->destid);
 
 	rdev->dev.bus = &rio_bus_type;
 
diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h
index 90987b7bcc1..32c0547ffaf 100644
--- a/include/linux/rio_drv.h
+++ b/include/linux/rio_drv.h
@@ -427,9 +427,9 @@ void rio_dev_put(struct rio_dev *);
  * Get the unique RIO device identifier. Returns the device
  * identifier string.
  */
-static inline char *rio_name(struct rio_dev *rdev)
+static inline const char *rio_name(struct rio_dev *rdev)
 {
-	return rdev->dev.bus_id;
+	return dev_name(&rdev->dev);
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 1a881f27c50b4fbd6858a8696a189263621136b0 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 15 Dec 2008 23:27:47 -0800
Subject: net: Add frag_list support to GSO

This patch allows GSO to handle frag_list in a limited way for the
purposes of allowing packets merged by GRO to be refragmented on
output.

Most hardware won't (and aren't expected to) support handling GRO
frag_list packets directly.  Therefore we will perform GSO in
software for those cases.

However, for drivers that can support it (such as virtual NICs) we
may not have to segment the packets at all.

Whether the added overhead of GRO/GSO is worthwhile for bridges
and routers when weighed against the benefit of potentially
increasing the MTU within the host is still an open question.
However, for the case of host nodes this is undoubtedly a win.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 net/core/dev.c            | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b60c26b7d31..bdf5465deb9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1858,6 +1858,8 @@ static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
 {
 	return skb_is_gso(skb) &&
 	       (!skb_gso_ok(skb, dev->features) ||
+	        (skb_shinfo(skb)->frag_list &&
+	         !(dev->features & NETIF_F_FRAGLIST)) ||
 		unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index f54cac76438..e415f0b0d0d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1533,8 +1533,6 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
 	__be16 type = skb->protocol;
 	int err;
 
-	BUG_ON(skb_shinfo(skb)->frag_list);
-
 	skb_reset_mac_header(skb);
 	skb->mac_len = skb->network_header - skb->mac_header;
 	__skb_pull(skb, skb->mac_len);
-- 
cgit v1.2.3-70-g09d2


From d565b0a1a9b6ee7dff46e1f68b26b526ac11ae50 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 15 Dec 2008 23:38:52 -0800
Subject: net: Add Generic Receive Offload infrastructure

This patch adds the top-level GRO (Generic Receive Offload) infrastructure.
This is pretty similar to LRO except that this is protocol-independent.
Instead of holding packets in an lro_mgr structure, they're now held in
napi_struct.

For drivers that intend to use this, they can set the NETIF_F_GRO bit and
call napi_gro_receive instead of netif_receive_skb or just call netif_rx.
The latter will call napi_receive_skb automatically.  When napi_gro_receive
is used, the driver must either call napi_complete/napi_rx_complete, or
call napi_gro_flush in softirq context if the driver uses the primitives
__napi_complete/__napi_rx_complete.

Protocols will set the gro_receive and gro_complete function pointers in
order to participate in this scheme.

In addition to the packet, gro_receive will get a list of currently held
packets.  Each packet in the list has a same_flow field which is non-zero
if it is a potential match for the new packet.  For each packet that may
match, they also have a flush field which is non-zero if the held packet
must not be merged with the new packet.

Once gro_receive has determined that the new skb matches a held packet,
the held packet may be processed immediately if the new skb cannot be
merged with it.  In this case gro_receive should return the pointer to
the existing skb in gro_list.  Otherwise the new skb should be merged into
the existing packet and NULL should be returned, unless the new skb makes
it impossible for any further merges to be made (e.g., FIN packet) where
the merged skb should be returned.

Whenever the skb is merged into an existing entry, the gro_receive
function should set NAPI_GRO_CB(skb)->same_flow.  Note that if an skb
merely matches an existing entry but can't be merged with it, then
this shouldn't be set.

If gro_receive finds it pointless to hold the new skb for future merging,
it should set NAPI_GRO_CB(skb)->flush.

Held packets will be flushed by napi_gro_flush which is called by
napi_complete and napi_rx_complete.

Currently held packets are stored in a singly liked list just like LRO.
The list is limited to a maximum of 8 entries.  In future, this may be
expanded to use a hash table to allow more flows to be held for merging.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  80 +++++++------------
 include/linux/netpoll.h   |   5 --
 net/core/dev.c            | 193 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 219 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bdf5465deb9..58856b6737f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -314,8 +314,9 @@ struct napi_struct {
 	spinlock_t		poll_lock;
 	int			poll_owner;
 	struct net_device	*dev;
-	struct list_head	dev_list;
 #endif
+	struct list_head	dev_list;
+	struct sk_buff		*gro_list;
 };
 
 enum
@@ -376,22 +377,8 @@ static inline int napi_reschedule(struct napi_struct *napi)
  *
  * Mark NAPI processing as complete.
  */
-static inline void __napi_complete(struct napi_struct *n)
-{
-	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
-	list_del(&n->poll_list);
-	smp_mb__before_clear_bit();
-	clear_bit(NAPI_STATE_SCHED, &n->state);
-}
-
-static inline void napi_complete(struct napi_struct *n)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__napi_complete(n);
-	local_irq_restore(flags);
-}
+extern void __napi_complete(struct napi_struct *n);
+extern void napi_complete(struct napi_struct *n);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
@@ -640,9 +627,7 @@ struct net_device
 	unsigned long		state;
 
 	struct list_head	dev_list;
-#ifdef CONFIG_NETPOLL
 	struct list_head	napi_list;
-#endif
 
 	/* Net device features */
 	unsigned long		features;
@@ -661,6 +646,7 @@ struct net_device
 #define NETIF_F_LLTX		4096	/* LockLess TX - deprecated. Please */
 					/* do not use LLTX in new drivers */
 #define NETIF_F_NETNS_LOCAL	8192	/* Does not change network namespaces */
+#define NETIF_F_GRO		16384	/* Generic receive offload */
 #define NETIF_F_LRO		32768	/* large receive offload */
 
 	/* Segmentation offload features */
@@ -984,22 +970,8 @@ static inline void *netdev_priv(const struct net_device *dev)
  * netif_napi_add() must be used to initialize a napi context prior to calling
  * *any* of the other napi related functions.
  */
-static inline void netif_napi_add(struct net_device *dev,
-				  struct napi_struct *napi,
-				  int (*poll)(struct napi_struct *, int),
-				  int weight)
-{
-	INIT_LIST_HEAD(&napi->poll_list);
-	napi->poll = poll;
-	napi->weight = weight;
-#ifdef CONFIG_NETPOLL
-	napi->dev = dev;
-	list_add(&napi->dev_list, &dev->napi_list);
-	spin_lock_init(&napi->poll_lock);
-	napi->poll_owner = -1;
-#endif
-	set_bit(NAPI_STATE_SCHED, &napi->state);
-}
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+		    int (*poll)(struct napi_struct *, int), int weight);
 
 /**
  *  netif_napi_del - remove a napi context
@@ -1007,12 +979,20 @@ static inline void netif_napi_add(struct net_device *dev,
  *
  *  netif_napi_del() removes a napi context from the network device napi list
  */
-static inline void netif_napi_del(struct napi_struct *napi)
-{
-#ifdef CONFIG_NETPOLL
-	list_del(&napi->dev_list);
-#endif
-}
+void netif_napi_del(struct napi_struct *napi);
+
+struct napi_gro_cb {
+	/* This is non-zero if the packet may be of the same flow. */
+	int same_flow;
+
+	/* This is non-zero if the packet cannot be merged with the new skb. */
+	int flush;
+
+	/* Number of segments aggregated. */
+	int count;
+};
+
+#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
 
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
@@ -1024,6 +1004,9 @@ struct packet_type {
 	struct sk_buff		*(*gso_segment)(struct sk_buff *skb,
 						int features);
 	int			(*gso_send_check)(struct sk_buff *skb);
+	struct sk_buff		**(*gro_receive)(struct sk_buff **head,
+					       struct sk_buff *skb);
+	int			(*gro_complete)(struct sk_buff *skb);
 	void			*af_packet_priv;
 	struct list_head	list;
 };
@@ -1377,6 +1360,9 @@ extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
 #define HAVE_NETIF_RECEIVE_SKB 1
 extern int		netif_receive_skb(struct sk_buff *skb);
+extern void		napi_gro_flush(struct napi_struct *napi);
+extern int		napi_gro_receive(struct napi_struct *napi,
+					 struct sk_buff *skb);
 extern void		netif_nit_deliver(struct sk_buff *skb);
 extern int		dev_valid_name(const char *name);
 extern int		dev_ioctl(struct net *net, unsigned int cmd, void __user *);
@@ -1621,17 +1607,7 @@ static inline void __netif_rx_complete(struct net_device *dev,
 static inline void netif_rx_complete(struct net_device *dev,
 				     struct napi_struct *napi)
 {
-	unsigned long flags;
-
-	/*
-	 * don't let napi dequeue from the cpu poll list
-	 * just in case its running on a different cpu
-	 */
-	if (unlikely(test_bit(NAPI_STATE_NPSVC, &napi->state)))
-		return;
-	local_irq_save(flags);
-	__netif_rx_complete(dev, napi);
-	local_irq_restore(flags);
+	napi_complete(napi);
 }
 
 static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index e3d79593fb3..e38d3c9dccd 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -94,11 +94,6 @@ static inline void netpoll_poll_unlock(void *have)
 	rcu_read_unlock();
 }
 
-static inline void netpoll_netdev_init(struct net_device *dev)
-{
-	INIT_LIST_HEAD(&dev->napi_list);
-}
-
 #else
 static inline int netpoll_rx(struct sk_buff *skb)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index e415f0b0d0d..d8d7d1fccde 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -129,6 +129,9 @@
 
 #include "net-sysfs.h"
 
+/* Instead of increasing this, you should create a hash table. */
+#define MAX_GRO_SKBS 8
+
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -2335,6 +2338,122 @@ static void flush_backlog(void *arg)
 		}
 }
 
+static int napi_gro_complete(struct sk_buff *skb)
+{
+	struct packet_type *ptype;
+	__be16 type = skb->protocol;
+	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	int err = -ENOENT;
+
+	if (!skb_shinfo(skb)->frag_list)
+		goto out;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, head, list) {
+		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
+			continue;
+
+		err = ptype->gro_complete(skb);
+		break;
+	}
+	rcu_read_unlock();
+
+	if (err) {
+		WARN_ON(&ptype->list == head);
+		kfree_skb(skb);
+		return NET_RX_SUCCESS;
+	}
+
+out:
+	__skb_push(skb, -skb_network_offset(skb));
+	return netif_receive_skb(skb);
+}
+
+void napi_gro_flush(struct napi_struct *napi)
+{
+	struct sk_buff *skb, *next;
+
+	for (skb = napi->gro_list; skb; skb = next) {
+		next = skb->next;
+		skb->next = NULL;
+		napi_gro_complete(skb);
+	}
+
+	napi->gro_list = NULL;
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+	struct sk_buff **pp = NULL;
+	struct packet_type *ptype;
+	__be16 type = skb->protocol;
+	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
+	int count = 0;
+	int mac_len;
+
+	if (!(skb->dev->features & NETIF_F_GRO))
+		goto normal;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ptype, head, list) {
+		struct sk_buff *p;
+
+		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
+			continue;
+
+		skb_reset_network_header(skb);
+		mac_len = skb->network_header - skb->mac_header;
+		skb->mac_len = mac_len;
+		NAPI_GRO_CB(skb)->same_flow = 0;
+		NAPI_GRO_CB(skb)->flush = 0;
+
+		for (p = napi->gro_list; p; p = p->next) {
+			count++;
+			NAPI_GRO_CB(p)->same_flow =
+				p->mac_len == mac_len &&
+				!memcmp(skb_mac_header(p), skb_mac_header(skb),
+					mac_len);
+			NAPI_GRO_CB(p)->flush = 0;
+		}
+
+		pp = ptype->gro_receive(&napi->gro_list, skb);
+		break;
+	}
+	rcu_read_unlock();
+
+	if (&ptype->list == head)
+		goto normal;
+
+	if (pp) {
+		struct sk_buff *nskb = *pp;
+
+		*pp = nskb->next;
+		nskb->next = NULL;
+		napi_gro_complete(nskb);
+		count--;
+	}
+
+	if (NAPI_GRO_CB(skb)->same_flow)
+		goto ok;
+
+	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
+		__skb_push(skb, -skb_network_offset(skb));
+		goto normal;
+	}
+
+	NAPI_GRO_CB(skb)->count = 1;
+	skb->next = napi->gro_list;
+	napi->gro_list = skb;
+
+ok:
+	return NET_RX_SUCCESS;
+
+normal:
+	return netif_receive_skb(skb);
+}
+EXPORT_SYMBOL(napi_gro_receive);
+
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
@@ -2354,9 +2473,11 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		}
 		local_irq_enable();
 
-		netif_receive_skb(skb);
+		napi_gro_receive(napi, skb);
 	} while (++work < quota && jiffies == start_time);
 
+	napi_gro_flush(napi);
+
 	return work;
 }
 
@@ -2377,6 +2498,68 @@ void __napi_schedule(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule);
 
+void __napi_complete(struct napi_struct *n)
+{
+	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+	BUG_ON(n->gro_list);
+
+	list_del(&n->poll_list);
+	smp_mb__before_clear_bit();
+	clear_bit(NAPI_STATE_SCHED, &n->state);
+}
+EXPORT_SYMBOL(__napi_complete);
+
+void napi_complete(struct napi_struct *n)
+{
+	unsigned long flags;
+
+	/*
+	 * don't let napi dequeue from the cpu poll list
+	 * just in case its running on a different cpu
+	 */
+	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+		return;
+
+	napi_gro_flush(n);
+	local_irq_save(flags);
+	__napi_complete(n);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(napi_complete);
+
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+		    int (*poll)(struct napi_struct *, int), int weight)
+{
+	INIT_LIST_HEAD(&napi->poll_list);
+	napi->gro_list = NULL;
+	napi->poll = poll;
+	napi->weight = weight;
+	list_add(&napi->dev_list, &dev->napi_list);
+#ifdef CONFIG_NETPOLL
+	napi->dev = dev;
+	spin_lock_init(&napi->poll_lock);
+	napi->poll_owner = -1;
+#endif
+	set_bit(NAPI_STATE_SCHED, &napi->state);
+}
+EXPORT_SYMBOL(netif_napi_add);
+
+void netif_napi_del(struct napi_struct *napi)
+{
+	struct sk_buff *skb, *next;
+
+	list_del(&napi->dev_list);
+
+	for (skb = napi->gro_list; skb; skb = next) {
+		next = skb->next;
+		skb->next = NULL;
+		kfree_skb(skb);
+	}
+
+	napi->gro_list = NULL;
+}
+EXPORT_SYMBOL(netif_napi_del);
+
 
 static void net_rx_action(struct softirq_action *h)
 {
@@ -4380,7 +4563,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 
 	netdev_init_queues(dev);
 
-	netpoll_netdev_init(dev);
+	INIT_LIST_HEAD(&dev->napi_list);
 	setup(dev);
 	strcpy(dev->name, name);
 	return dev;
@@ -4397,10 +4580,15 @@ EXPORT_SYMBOL(alloc_netdev_mq);
  */
 void free_netdev(struct net_device *dev)
 {
+	struct napi_struct *p, *n;
+
 	release_net(dev_net(dev));
 
 	kfree(dev->_tx);
 
+	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+		netif_napi_del(p);
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		kfree((char *)dev - dev->padded);
@@ -4949,6 +5137,7 @@ static int __init net_dev_init(void)
 
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
+		queue->backlog.gro_list = NULL;
 	}
 
 	dev_boot_phase = 0;
-- 
cgit v1.2.3-70-g09d2


From 71d93b39e52e92aea35f1058d957cf12250d0b75 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 15 Dec 2008 23:42:33 -0800
Subject: net: Add skb_gro_receive

This patch adds the helper skb_gro_receive to merge packets for
GRO.  The current method is to allocate a new header skb and then
chain the original packets to its frag_list.  This is done to
make it easier to integrate into the existing GSO framework.

In future as GSO is moved into the drivers, we can undo this and
simply chain the original packets together.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index acf17af45af..cf2cb50f77d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1687,6 +1687,8 @@ extern int	       skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
 				 int shiftlen);
 
 extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
+extern int	       skb_gro_receive(struct sk_buff **head,
+				       struct sk_buff *skb);
 
 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 				       int len, void *buffer)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 18e224af05a..b8d0abb2643 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2582,6 +2582,65 @@ err:
 
 EXPORT_SYMBOL_GPL(skb_segment);
 
+int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct sk_buff *p = *head;
+	struct sk_buff *nskb;
+	unsigned int headroom;
+	unsigned int hlen = p->data - skb_mac_header(p);
+
+	if (hlen + p->len + skb->len >= 65536)
+		return -E2BIG;
+
+	if (skb_shinfo(p)->frag_list)
+		goto merge;
+
+	headroom = skb_headroom(p);
+	nskb = netdev_alloc_skb(p->dev, headroom);
+	if (unlikely(!nskb))
+		return -ENOMEM;
+
+	__copy_skb_header(nskb, p);
+	nskb->mac_len = p->mac_len;
+
+	skb_reserve(nskb, headroom);
+
+	skb_set_mac_header(nskb, -hlen);
+	skb_set_network_header(nskb, skb_network_offset(p));
+	skb_set_transport_header(nskb, skb_transport_offset(p));
+
+	memcpy(skb_mac_header(nskb), skb_mac_header(p), hlen);
+
+	*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);
+	skb_shinfo(nskb)->frag_list = p;
+	skb_header_release(p);
+	nskb->prev = p;
+
+	nskb->data_len += p->len;
+	nskb->truesize += p->len;
+	nskb->len += p->len;
+
+	*head = nskb;
+	nskb->next = p->next;
+	p->next = NULL;
+
+	p = nskb;
+
+merge:
+	NAPI_GRO_CB(p)->count++;
+	p->prev->next = skb;
+	p->prev = skb;
+	skb_header_release(skb);
+
+	p->data_len += skb->len;
+	p->truesize += skb->len;
+	p->len += skb->len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_gro_receive);
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
-- 
cgit v1.2.3-70-g09d2


From b240a0e5644eb817c4a397098a40e1ad42a615bc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 15 Dec 2008 23:44:31 -0800
Subject: ethtool: Add GGRO and SGRO ops

This patch adds the ethtool ops to enable and disable GRO.  It also
makes GRO depend on RX checksum offload much the same as how TSO
depends on SG support.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h |  2 ++
 net/core/ethtool.c      | 53 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index b4b038b89ee..27c67a54223 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -467,6 +467,8 @@ struct ethtool_ops {
 
 #define	ETHTOOL_GRXFH		0x00000029 /* Get RX flow hash configuration */
 #define	ETHTOOL_SRXFH		0x0000002a /* Set RX flow hash configuration */
+#define ETHTOOL_GGRO		0x0000002b /* Get GRO enable (ethtool_value) */
+#define ETHTOOL_SGRO		0x0000002c /* Set GRO enable (ethtool_value) */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 14ada537f89..947710a36ce 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -528,6 +528,22 @@ static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr)
 	return dev->ethtool_ops->set_tx_csum(dev, edata.data);
 }
 
+static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (!dev->ethtool_ops->set_rx_csum)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)))
+		return -EFAULT;
+
+	if (!edata.data && dev->ethtool_ops->set_sg)
+		dev->features &= ~NETIF_F_GRO;
+
+	return dev->ethtool_ops->set_rx_csum(dev, edata.data);
+}
+
 static int ethtool_set_sg(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_value edata;
@@ -599,6 +615,34 @@ static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
 	return 0;
 }
 
+static int ethtool_get_gro(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata = { ETHTOOL_GGRO };
+
+	edata.data = dev->features & NETIF_F_GRO;
+	if (copy_to_user(useraddr, &edata, sizeof(edata)))
+		 return -EFAULT;
+	return 0;
+}
+
+static int ethtool_set_gro(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)))
+		return -EFAULT;
+
+	if (edata.data) {
+		if (!dev->ethtool_ops->get_rx_csum ||
+		    !dev->ethtool_ops->get_rx_csum(dev))
+			return -EINVAL;
+		dev->features |= NETIF_F_GRO;
+	} else
+		dev->features &= ~NETIF_F_GRO;
+
+	return 0;
+}
+
 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_test test;
@@ -932,8 +976,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 				       dev->ethtool_ops->get_rx_csum);
 		break;
 	case ETHTOOL_SRXCSUM:
-		rc = ethtool_set_value(dev, useraddr,
-				       dev->ethtool_ops->set_rx_csum);
+		rc = ethtool_set_rx_csum(dev, useraddr);
 		break;
 	case ETHTOOL_GTXCSUM:
 		rc = ethtool_get_value(dev, useraddr, ethcmd,
@@ -1014,6 +1057,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_SRXFH:
 		rc = ethtool_set_rxhash(dev, useraddr);
 		break;
+	case ETHTOOL_GGRO:
+		rc = ethtool_get_gro(dev, useraddr);
+		break;
+	case ETHTOOL_SGRO:
+		rc = ethtool_set_gro(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-70-g09d2


From 092cab7e2cd868cb0b30209a0337689c3ffd6133 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 16 Dec 2008 01:19:41 -0800
Subject: netfilter: ctnetlink: fix missing CTA_NAT_SEQ_UNSPEC

This patch fixes an inconsistency in nfnetlink_conntrack.h that
I introduced myself. The problem is that CTA_NAT_SEQ_UNSPEC is
missing from enum ctattr_natseq. This inconsistency may lead to
problems in the message parsing in userspace (if the message
contains the CTA_NAT_SEQ_* attributes, of course).

This patch breaks backward compatibility, however, the only known
client of this code is libnetfilter_conntrack which indeed crashes
because it assumes the existence of CTA_NAT_SEQ_UNSPEC to do
the parsing.

The CTA_NAT_SEQ_* attributes were introduced in 2.6.25.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink_conntrack.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index c19595c8930..29fe9ea1d34 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -141,6 +141,7 @@ enum ctattr_protonat {
 #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1)
 
 enum ctattr_natseq {
+	CTA_NAT_SEQ_UNSPEC,
 	CTA_NAT_SEQ_CORRECTION_POS,
 	CTA_NAT_SEQ_OFFSET_BEFORE,
 	CTA_NAT_SEQ_OFFSET_AFTER,
-- 
cgit v1.2.3-70-g09d2


From e18ce3465477502108187c6c08b6423fb784a313 Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Tue, 16 Dec 2008 02:00:00 -0800
Subject: net: Move flow control definitions to mii.h

flags used within drivers for indicating tx and rx flow control are
defined in 4 drivers (and probably more), move these constants to mii.h.

The 3 SMSC drivers use the same constants (FLOW_CTRL_TX), but TG3 uses
TG3_FLOW_CTRL_TX, so this patch also renames the constants within TG3.

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/smsc911x.h     |  3 ---
 drivers/net/smsc9420.h     |  3 ---
 drivers/net/tg3.c          | 50 +++++++++++++++++++++++-----------------------
 drivers/net/tg3.h          |  2 --
 drivers/net/usb/smsc95xx.c |  2 --
 include/linux/mii.h        |  4 ++++
 6 files changed, 29 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/smsc911x.h b/drivers/net/smsc911x.h
index f818cf0415f..2b76654bb95 100644
--- a/drivers/net/smsc911x.h
+++ b/drivers/net/smsc911x.h
@@ -61,9 +61,6 @@
 #define SMSC_ASSERT_MAC_LOCK(pdata) do {} while (0)
 #endif				/* CONFIG_DEBUG_SPINLOCK */
 
-#define FLOW_CTRL_TX		(1)
-#define FLOW_CTRL_RX		(2)
-
 /* SMSC911x registers and bitfields */
 #define RX_DATA_FIFO			0x00
 
diff --git a/drivers/net/smsc9420.h b/drivers/net/smsc9420.h
index 80c426dc432..69c351f93f8 100644
--- a/drivers/net/smsc9420.h
+++ b/drivers/net/smsc9420.h
@@ -45,9 +45,6 @@
 
 #define SMSC9420_EEPROM_SIZE		((u32)11)
 
-#define FLOW_CTRL_TX			(1)
-#define FLOW_CTRL_RX			(2)
-
 #define PKT_BUF_SZ			(VLAN_ETH_FRAME_LEN + NET_IP_ALIGN + 4)
 
 /***********************************************/
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 6e40d52107a..f353f69caeb 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1187,9 +1187,9 @@ static void tg3_link_report(struct tg3 *tp)
 		printk(KERN_INFO PFX
 		       "%s: Flow control is %s for TX and %s for RX.\n",
 		       tp->dev->name,
-		       (tp->link_config.active_flowctrl & TG3_FLOW_CTRL_TX) ?
+		       (tp->link_config.active_flowctrl & FLOW_CTRL_TX) ?
 		       "on" : "off",
-		       (tp->link_config.active_flowctrl & TG3_FLOW_CTRL_RX) ?
+		       (tp->link_config.active_flowctrl & FLOW_CTRL_RX) ?
 		       "on" : "off");
 		tg3_ump_link_report(tp);
 	}
@@ -1199,11 +1199,11 @@ static u16 tg3_advert_flowctrl_1000T(u8 flow_ctrl)
 {
 	u16 miireg;
 
-	if ((flow_ctrl & TG3_FLOW_CTRL_TX) && (flow_ctrl & TG3_FLOW_CTRL_RX))
+	if ((flow_ctrl & FLOW_CTRL_TX) && (flow_ctrl & FLOW_CTRL_RX))
 		miireg = ADVERTISE_PAUSE_CAP;
-	else if (flow_ctrl & TG3_FLOW_CTRL_TX)
+	else if (flow_ctrl & FLOW_CTRL_TX)
 		miireg = ADVERTISE_PAUSE_ASYM;
-	else if (flow_ctrl & TG3_FLOW_CTRL_RX)
+	else if (flow_ctrl & FLOW_CTRL_RX)
 		miireg = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM;
 	else
 		miireg = 0;
@@ -1215,11 +1215,11 @@ static u16 tg3_advert_flowctrl_1000X(u8 flow_ctrl)
 {
 	u16 miireg;
 
-	if ((flow_ctrl & TG3_FLOW_CTRL_TX) && (flow_ctrl & TG3_FLOW_CTRL_RX))
+	if ((flow_ctrl & FLOW_CTRL_TX) && (flow_ctrl & FLOW_CTRL_RX))
 		miireg = ADVERTISE_1000XPAUSE;
-	else if (flow_ctrl & TG3_FLOW_CTRL_TX)
+	else if (flow_ctrl & FLOW_CTRL_TX)
 		miireg = ADVERTISE_1000XPSE_ASYM;
-	else if (flow_ctrl & TG3_FLOW_CTRL_RX)
+	else if (flow_ctrl & FLOW_CTRL_RX)
 		miireg = ADVERTISE_1000XPAUSE | ADVERTISE_1000XPSE_ASYM;
 	else
 		miireg = 0;
@@ -1256,16 +1256,16 @@ static u8 tg3_resolve_flowctrl_1000X(u16 lcladv, u16 rmtadv)
 	if (lcladv & ADVERTISE_1000XPAUSE) {
 		if (lcladv & ADVERTISE_1000XPSE_ASYM) {
 			if (rmtadv & LPA_1000XPAUSE)
-				cap = TG3_FLOW_CTRL_TX | TG3_FLOW_CTRL_RX;
+				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
 			else if (rmtadv & LPA_1000XPAUSE_ASYM)
-				cap = TG3_FLOW_CTRL_RX;
+				cap = FLOW_CTRL_RX;
 		} else {
 			if (rmtadv & LPA_1000XPAUSE)
-				cap = TG3_FLOW_CTRL_TX | TG3_FLOW_CTRL_RX;
+				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
 		}
 	} else if (lcladv & ADVERTISE_1000XPSE_ASYM) {
 		if ((rmtadv & LPA_1000XPAUSE) && (rmtadv & LPA_1000XPAUSE_ASYM))
-			cap = TG3_FLOW_CTRL_TX;
+			cap = FLOW_CTRL_TX;
 	}
 
 	return cap;
@@ -1294,7 +1294,7 @@ static void tg3_setup_flow_control(struct tg3 *tp, u32 lcladv, u32 rmtadv)
 
 	tp->link_config.active_flowctrl = flowctrl;
 
-	if (flowctrl & TG3_FLOW_CTRL_RX)
+	if (flowctrl & FLOW_CTRL_RX)
 		tp->rx_mode |= RX_MODE_FLOW_CTRL_ENABLE;
 	else
 		tp->rx_mode &= ~RX_MODE_FLOW_CTRL_ENABLE;
@@ -1302,7 +1302,7 @@ static void tg3_setup_flow_control(struct tg3 *tp, u32 lcladv, u32 rmtadv)
 	if (old_rx_mode != tp->rx_mode)
 		tw32_f(MAC_RX_MODE, tp->rx_mode);
 
-	if (flowctrl & TG3_FLOW_CTRL_TX)
+	if (flowctrl & FLOW_CTRL_TX)
 		tp->tx_mode |= TX_MODE_FLOW_CTRL_ENABLE;
 	else
 		tp->tx_mode &= ~TX_MODE_FLOW_CTRL_ENABLE;
@@ -9419,12 +9419,12 @@ static void tg3_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 
 	epause->autoneg = (tp->tg3_flags & TG3_FLAG_PAUSE_AUTONEG) != 0;
 
-	if (tp->link_config.active_flowctrl & TG3_FLOW_CTRL_RX)
+	if (tp->link_config.active_flowctrl & FLOW_CTRL_RX)
 		epause->rx_pause = 1;
 	else
 		epause->rx_pause = 0;
 
-	if (tp->link_config.active_flowctrl & TG3_FLOW_CTRL_TX)
+	if (tp->link_config.active_flowctrl & FLOW_CTRL_TX)
 		epause->tx_pause = 1;
 	else
 		epause->tx_pause = 0;
@@ -9475,14 +9475,14 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 			}
 		} else {
 			if (epause->rx_pause)
-				tp->link_config.flowctrl |= TG3_FLOW_CTRL_RX;
+				tp->link_config.flowctrl |= FLOW_CTRL_RX;
 			else
-				tp->link_config.flowctrl &= ~TG3_FLOW_CTRL_RX;
+				tp->link_config.flowctrl &= ~FLOW_CTRL_RX;
 
 			if (epause->tx_pause)
-				tp->link_config.flowctrl |= TG3_FLOW_CTRL_TX;
+				tp->link_config.flowctrl |= FLOW_CTRL_TX;
 			else
-				tp->link_config.flowctrl &= ~TG3_FLOW_CTRL_TX;
+				tp->link_config.flowctrl &= ~FLOW_CTRL_TX;
 
 			if (netif_running(dev))
 				tg3_setup_flow_control(tp, 0, 0);
@@ -9502,13 +9502,13 @@ static int tg3_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam
 		else
 			tp->tg3_flags &= ~TG3_FLAG_PAUSE_AUTONEG;
 		if (epause->rx_pause)
-			tp->link_config.flowctrl |= TG3_FLOW_CTRL_RX;
+			tp->link_config.flowctrl |= FLOW_CTRL_RX;
 		else
-			tp->link_config.flowctrl &= ~TG3_FLOW_CTRL_RX;
+			tp->link_config.flowctrl &= ~FLOW_CTRL_RX;
 		if (epause->tx_pause)
-			tp->link_config.flowctrl |= TG3_FLOW_CTRL_TX;
+			tp->link_config.flowctrl |= FLOW_CTRL_TX;
 		else
-			tp->link_config.flowctrl &= ~TG3_FLOW_CTRL_TX;
+			tp->link_config.flowctrl &= ~FLOW_CTRL_TX;
 
 		if (netif_running(dev)) {
 			tg3_halt(tp, RESET_KIND_SHUTDOWN, 1);
@@ -13849,7 +13849,7 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 
 	/* flow control autonegotiation is default behavior */
 	tp->tg3_flags |= TG3_FLAG_PAUSE_AUTONEG;
-	tp->link_config.flowctrl = TG3_FLOW_CTRL_TX | TG3_FLOW_CTRL_RX;
+	tp->link_config.flowctrl = FLOW_CTRL_TX | FLOW_CTRL_RX;
 
 	tg3_init_coal(tp);
 
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 61556764a50..0880cfacdcb 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2338,8 +2338,6 @@ struct tg3_link_config {
 	u8				duplex;
 	u8				autoneg;
 	u8				flowctrl;
-#define TG3_FLOW_CTRL_TX		0x01
-#define TG3_FLOW_CTRL_RX		0x02
 
 	/* Describes what we actually have. */
 	u8				active_flowctrl;
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index 4638a7bb471..ee2eac3047b 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -45,8 +45,6 @@
 #define SMSC95XX_INTERNAL_PHY_ID	(1)
 #define SMSC95XX_TX_OVERHEAD		(8)
 #define SMSC95XX_TX_OVERHEAD_CSUM	(12)
-#define FLOW_CTRL_TX			(1)
-#define FLOW_CTRL_RX			(2)
 
 struct smsc95xx_priv {
 	u32 mac_cr;
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 151b7e0182c..4a376e0816f 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -135,6 +135,10 @@
 #define LPA_1000FULL            0x0800  /* Link partner 1000BASE-T full duplex */
 #define LPA_1000HALF            0x0400  /* Link partner 1000BASE-T half duplex */
 
+/* Flow control flags */
+#define FLOW_CTRL_TX		0x01
+#define FLOW_CTRL_RX		0x02
+
 /* This structure is used in all SIOCxMIIxxx ioctl calls */
 struct mii_ioctl_data {
 	__u16		phy_id;
-- 
cgit v1.2.3-70-g09d2


From bc02ff95fe4ebd3e5ee7455c0aa6f76ebe39ebca Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Tue, 16 Dec 2008 02:00:48 -0800
Subject: net: Refactor full duplex flow control resolution

These 4 drivers have identical full duplex flow control resolution
functions.  This patch changes them all to use one common function.

The function in question decides whether a device should enable TX and
RX flow control in a standard way (IEEE 802.3-2005 table 28B-3), so this
should also be useful for other drivers.

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/smsc911x.c     | 24 +-----------------------
 drivers/net/smsc9420.c     | 24 +-----------------------
 drivers/net/tg3.c          | 24 +-----------------------
 drivers/net/usb/smsc95xx.c | 24 +-----------------------
 include/linux/mii.h        | 29 +++++++++++++++++++++++++++++
 5 files changed, 33 insertions(+), 92 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index ae327166f97..fa28542b47d 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -642,28 +642,6 @@ static int smsc911x_phy_loopbacktest(struct net_device *dev)
 }
 #endif				/* USE_PHY_WORK_AROUND */
 
-static u8 smsc95xx_resolve_flowctrl_fulldplx(u16 lcladv, u16 rmtadv)
-{
-	u8 cap = 0;
-
-	if (lcladv & ADVERTISE_PAUSE_CAP) {
-		if (lcladv & ADVERTISE_PAUSE_ASYM) {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
-			else if (rmtadv & LPA_PAUSE_ASYM)
-				cap = FLOW_CTRL_RX;
-		} else {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
-		}
-	} else if (lcladv & ADVERTISE_PAUSE_ASYM) {
-		if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM))
-			cap = FLOW_CTRL_TX;
-	}
-
-	return cap;
-}
-
 static void smsc911x_phy_update_flowcontrol(struct smsc911x_data *pdata)
 {
 	struct phy_device *phy_dev = pdata->phy_dev;
@@ -674,7 +652,7 @@ static void smsc911x_phy_update_flowcontrol(struct smsc911x_data *pdata)
 	if (phy_dev->duplex == DUPLEX_FULL) {
 		u16 lcladv = phy_read(phy_dev, MII_ADVERTISE);
 		u16 rmtadv = phy_read(phy_dev, MII_LPA);
-		u8 cap = smsc95xx_resolve_flowctrl_fulldplx(lcladv, rmtadv);
+		u8 cap = mii_resolve_flowctrl_fdx(lcladv, rmtadv);
 
 		if (cap & FLOW_CTRL_RX)
 			flow = 0xFFFF0002;
diff --git a/drivers/net/smsc9420.c b/drivers/net/smsc9420.c
index bc9879d5f28..940220f6092 100644
--- a/drivers/net/smsc9420.c
+++ b/drivers/net/smsc9420.c
@@ -1080,28 +1080,6 @@ static void smsc9420_set_multicast_list(struct net_device *dev)
 	smsc9420_pci_flush_write(pd);
 }
 
-static u8 smsc9420_resolve_flowctrl_fulldplx(u16 lcladv, u16 rmtadv)
-{
-	u8 cap = 0;
-
-	if (lcladv & ADVERTISE_PAUSE_CAP) {
-		if (lcladv & ADVERTISE_PAUSE_ASYM) {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
-			else if (rmtadv & LPA_PAUSE_ASYM)
-				cap = FLOW_CTRL_RX;
-		} else {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
-		}
-	} else if (lcladv & ADVERTISE_PAUSE_ASYM) {
-		if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM))
-			cap = FLOW_CTRL_TX;
-	}
-
-	return cap;
-}
-
 static void smsc9420_phy_update_flowcontrol(struct smsc9420_pdata *pd)
 {
 	struct phy_device *phy_dev = pd->phy_dev;
@@ -1110,7 +1088,7 @@ static void smsc9420_phy_update_flowcontrol(struct smsc9420_pdata *pd)
 	if (phy_dev->duplex == DUPLEX_FULL) {
 		u16 lcladv = phy_read(phy_dev, MII_ADVERTISE);
 		u16 rmtadv = phy_read(phy_dev, MII_LPA);
-		u8 cap = smsc9420_resolve_flowctrl_fulldplx(lcladv, rmtadv);
+		u8 cap = mii_resolve_flowctrl_fdx(lcladv, rmtadv);
 
 		if (cap & FLOW_CTRL_RX)
 			flow = 0xFFFF0002;
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f353f69caeb..06bd2f4eee6 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -1227,28 +1227,6 @@ static u16 tg3_advert_flowctrl_1000X(u8 flow_ctrl)
 	return miireg;
 }
 
-static u8 tg3_resolve_flowctrl_1000T(u16 lcladv, u16 rmtadv)
-{
-	u8 cap = 0;
-
-	if (lcladv & ADVERTISE_PAUSE_CAP) {
-		if (lcladv & ADVERTISE_PAUSE_ASYM) {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = TG3_FLOW_CTRL_TX | TG3_FLOW_CTRL_RX;
-			else if (rmtadv & LPA_PAUSE_ASYM)
-				cap = TG3_FLOW_CTRL_RX;
-		} else {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = TG3_FLOW_CTRL_TX | TG3_FLOW_CTRL_RX;
-		}
-	} else if (lcladv & ADVERTISE_PAUSE_ASYM) {
-		if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM))
-			cap = TG3_FLOW_CTRL_TX;
-	}
-
-	return cap;
-}
-
 static u8 tg3_resolve_flowctrl_1000X(u16 lcladv, u16 rmtadv)
 {
 	u8 cap = 0;
@@ -1288,7 +1266,7 @@ static void tg3_setup_flow_control(struct tg3 *tp, u32 lcladv, u32 rmtadv)
 		if (tp->tg3_flags2 & TG3_FLG2_ANY_SERDES)
 			flowctrl = tg3_resolve_flowctrl_1000X(lcladv, rmtadv);
 		else
-			flowctrl = tg3_resolve_flowctrl_1000T(lcladv, rmtadv);
+			flowctrl = mii_resolve_flowctrl_fdx(lcladv, rmtadv);
 	} else
 		flowctrl = tp->link_config.flowctrl;
 
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index ee2eac3047b..fed22ffedd5 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -435,28 +435,6 @@ static void smsc95xx_set_multicast(struct net_device *netdev)
 	smsc95xx_write_reg_async(dev, MAC_CR, &pdata->mac_cr);
 }
 
-static u8 smsc95xx_resolve_flowctrl_fulldplx(u16 lcladv, u16 rmtadv)
-{
-	u8 cap = 0;
-
-	if (lcladv & ADVERTISE_PAUSE_CAP) {
-		if (lcladv & ADVERTISE_PAUSE_ASYM) {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
-			else if (rmtadv & LPA_PAUSE_ASYM)
-				cap = FLOW_CTRL_RX;
-		} else {
-			if (rmtadv & LPA_PAUSE_CAP)
-				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
-		}
-	} else if (lcladv & ADVERTISE_PAUSE_ASYM) {
-		if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM))
-			cap = FLOW_CTRL_TX;
-	}
-
-	return cap;
-}
-
 static void smsc95xx_phy_update_flowcontrol(struct usbnet *dev, u8 duplex,
 					    u16 lcladv, u16 rmtadv)
 {
@@ -469,7 +447,7 @@ static void smsc95xx_phy_update_flowcontrol(struct usbnet *dev, u8 duplex,
 	}
 
 	if (duplex == DUPLEX_FULL) {
-		u8 cap = smsc95xx_resolve_flowctrl_fulldplx(lcladv, rmtadv);
+		u8 cap = mii_resolve_flowctrl_fdx(lcladv, rmtadv);
 
 		if (cap & FLOW_CTRL_RX)
 			flow = 0xFFFF0002;
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 4a376e0816f..ad748588faf 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -239,5 +239,34 @@ static inline unsigned int mii_duplex (unsigned int duplex_lock,
 	return 0;
 }
 
+/**
+ * mii_resolve_flowctrl_fdx
+ * @lcladv: value of MII ADVERTISE register
+ * @rmtadv: value of MII LPA register
+ *
+ * Resolve full duplex flow control as per IEEE 802.3-2005 table 28B-3
+ */
+static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv)
+{
+	u8 cap = 0;
+
+	if (lcladv & ADVERTISE_PAUSE_CAP) {
+		if (lcladv & ADVERTISE_PAUSE_ASYM) {
+			if (rmtadv & LPA_PAUSE_CAP)
+				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
+			else if (rmtadv & LPA_PAUSE_ASYM)
+				cap = FLOW_CTRL_RX;
+		} else {
+			if (rmtadv & LPA_PAUSE_CAP)
+				cap = FLOW_CTRL_TX | FLOW_CTRL_RX;
+		}
+	} else if (lcladv & ADVERTISE_PAUSE_ASYM) {
+		if ((rmtadv & LPA_PAUSE_CAP) && (rmtadv & LPA_PAUSE_ASYM))
+			cap = FLOW_CTRL_TX;
+	}
+
+	return cap;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_MII_H__ */
-- 
cgit v1.2.3-70-g09d2


From b24a2516d10751d7ed5afb58420df25370c9dffb Mon Sep 17 00:00:00 2001
From: Yang Hongyang <yanghy@cn.fujitsu.com>
Date: Tue, 16 Dec 2008 02:06:23 -0800
Subject: ipv6: Add IPV6_PKTINFO sticky option support to setsockopt()

There are three reasons for me to add this support:
1.When no interface is specified in an IPV6_PKTINFO ancillary data
  item, the interface specified in an IPV6_PKTINFO sticky optionis
  is used.

RFC3542:
6.7.  Summary of Outgoing Interface Selection

   This document and [RFC-3493] specify various methods that affect the
   selection of the packet's outgoing interface.  This subsection
   summarizes the ordering among those in order to ensure deterministic
   behavior.

   For a given outgoing packet on a given socket, the outgoing interface
   is determined in the following order:

   1. if an interface is specified in an IPV6_PKTINFO ancillary data
      item, the interface is used.

   2. otherwise, if an interface is specified in an IPV6_PKTINFO sticky
      option, the interface is used.

2.When no IPV6_PKTINFO ancillary data is received,getsockopt() should
  return the sticky option value which set with setsockopt().

RFC 3542:
   Issuing getsockopt() for the above options will return the sticky
   option value i.e., the value set with setsockopt().  If no sticky
   option value has been set getsockopt() will return the following
   values:

3.Make the setsockopt implementation POSIX compliant.

Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     |  1 +
 net/ipv6/ipv6_sockglue.c | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 641e026eee8..0b816cae533 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -278,6 +278,7 @@ struct ipv6_pinfo {
 	struct in6_addr 	saddr;
 	struct in6_addr 	rcv_saddr;
 	struct in6_addr		daddr;
+	struct in6_pktinfo	sticky_pktinfo;
 	struct in6_addr		*daddr_cache;
 #ifdef CONFIG_IPV6_SUBTREES
 	struct in6_addr		*saddr_cache;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 2aa294be0c7..0feaee38bc3 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -395,6 +395,28 @@ sticky_done:
 		break;
 	}
 
+	case IPV6_PKTINFO:
+	{
+		struct in6_pktinfo pkt;
+
+		if (optlen == 0)
+			goto e_inval;
+		else if (optlen < sizeof(struct in6_pktinfo) || optval == NULL)
+			goto e_inval;
+
+		if (copy_from_user(&pkt, optval, optlen)) {
+				retv = -EFAULT;
+				break;
+		}
+		if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if)
+			goto e_inval;
+
+		np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
+		ipv6_addr_copy(&np->sticky_pktinfo.ipi6_addr, &pkt.ipi6_addr);
+		retv = 0;
+		break;
+	}
+
 	case IPV6_2292PKTOPTIONS:
 	{
 		struct ipv6_txoptions *opt = NULL;
-- 
cgit v1.2.3-70-g09d2


From 8c5df16bec8a60bb8589fc232b9e26cac0ed4b2c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 16 Dec 2008 12:17:26 -0800
Subject: swiotlb: allow architectures to override swiotlb pool allocation

Impact: generalize swiotlb allocation code

Architectures may need to allocate memory specially for use with
the swiotlb.  Create the weak function swiotlb_alloc_boot() and
swiotlb_alloc() defaulting to the current behaviour.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  3 +++
 lib/swiotlb.c           | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b18ec5533e8..b8c5fc766a5 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -10,6 +10,9 @@ struct scatterlist;
 extern void
 swiotlb_init(void);
 
+extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
+extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 5f6c629a924..abecb285755 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/swiotlb.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
@@ -126,6 +127,16 @@ setup_io_tlb_npages(char *str)
 __setup("swiotlb=", setup_io_tlb_npages);
 /* make io_tlb_overflow tunable too? */
 
+void * __weak swiotlb_alloc_boot(size_t size, unsigned long nslabs)
+{
+	return alloc_bootmem_low_pages(size);
+}
+
+void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
+{
+	return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -145,7 +156,7 @@ swiotlb_init_with_default_size(size_t default_size)
 	/*
 	 * Get IO TLB memory from the low pages
 	 */
-	io_tlb_start = alloc_bootmem_low_pages(bytes);
+	io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs);
 	if (!io_tlb_start)
 		panic("Cannot allocate SWIOTLB buffer");
 	io_tlb_end = io_tlb_start + bytes;
@@ -202,8 +213,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	bytes = io_tlb_nslabs << IO_TLB_SHIFT;
 
 	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-		io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
-		                                        order);
+		io_tlb_start = swiotlb_alloc(order, io_tlb_nslabs);
 		if (io_tlb_start)
 			break;
 		order--;
-- 
cgit v1.2.3-70-g09d2


From 0016fdee927f7aa0f428494bcf11ae60c7470a02 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:27 -0800
Subject: swiotlb: move some definitions to header

Impact: cleanup

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h | 14 ++++++++++++++
 lib/swiotlb.c           | 14 +-------------
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b8c5fc766a5..58b996a642f 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,6 +7,20 @@ struct device;
 struct dma_attrs;
 struct scatterlist;
 
+/*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2.  What is the appropriate value ?
+ * The complexity of {map,unmap}_single is linearly dependent on this value.
+ */
+#define IO_TLB_SEGSIZE	128
+
+
+/*
+ * log of the size of each IO TLB slab.  The number of slabs is command line
+ * controllable.
+ */
+#define IO_TLB_SHIFT 11
+
 extern void
 swiotlb_init(void);
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index abecb285755..db724ba7ebf 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -23,6 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/swiotlb.h>
 #include <linux/string.h>
+#include <linux/swiotlb.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
 
@@ -40,19 +41,6 @@
 #define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
 #define SG_ENT_PHYS_ADDRESS(sg)	virt_to_bus(SG_ENT_VIRT_ADDRESS(sg))
 
-/*
- * Maximum allowable number of contiguous slabs to map,
- * must be a power of 2.  What is the appropriate value ?
- * The complexity of {map,unmap}_single is linearly dependent on this value.
- */
-#define IO_TLB_SEGSIZE	128
-
-/*
- * log of the size of each IO TLB slab.  The number of slabs is command line
- * controllable.
- */
-#define IO_TLB_SHIFT 11
-
 #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 48a1b10aff588833b73994704c47bbd0deb73e9c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 11 Dec 2008 00:15:01 -0800
Subject: x86, sparseirq: move irq_desc according to smp_affinity, v7

Impact: improve NUMA handling by migrating irq_desc on smp_affinity changes

if CONFIG_NUMA_MIGRATE_IRQ_DESC is set:

-  make irq_desc to go with affinity aka irq_desc moving etc
-  call move_irq_desc in irq_complete_move()
-  legacy irq_desc is not moved, because they are allocated via static array

for logical apic mode, need to add move_desc_in_progress_in_same_domain,
otherwise it will not be moved ==> also could need two phases to get
irq_desc moved.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig          |   9 +++
 arch/x86/kernel/io_apic.c | 142 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/irq.h       |  10 ++++
 kernel/irq/Makefile       |   1 +
 kernel/irq/chip.c         |  12 +++-
 kernel/irq/handle.c       |  15 +++--
 kernel/irq/internals.h    |   5 ++
 kernel/irq/numa_migrate.c | 127 +++++++++++++++++++++++++++++++++++++++++
 8 files changed, 313 insertions(+), 8 deletions(-)
 create mode 100644 kernel/irq/numa_migrate.c

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8943c13502c..29073532f94 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -248,6 +248,15 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say Y.
 
+config NUMA_MIGRATE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default n
+	help
+	  This enables moving irq_desc to cpu/node that irq will use handled.
+
+	  If you don't know what to do here, say N.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index a1a2e070f31..bfe1245b1a3 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -141,6 +141,9 @@ struct irq_cfg {
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+	u8 move_desc_pending : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -223,6 +226,121 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu)
 	}
 }
 
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+{
+	struct irq_pin_list *old_entry, *head, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
+
+	entry = get_one_free_irq_2_pin(cpu);
+	if (!entry)
+		return;
+
+	entry->apic	= old_entry->apic;
+	entry->pin	= old_entry->pin;
+	head		= entry;
+	tail		= entry;
+	old_entry	= old_entry->next;
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			entry = head;
+			while (entry) {
+				head = entry->next;
+				kfree(entry);
+				entry = head;
+			}
+			/* still use the old one */
+			return;
+		}
+		entry->apic	= old_entry->apic;
+		entry->pin	= old_entry->pin;
+		tail->next	= entry;
+		tail		= entry;
+		old_entry	= old_entry->next;
+	}
+
+	tail->next = NULL;
+	cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry, *next;
+
+	if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+		return;
+
+	entry = old_cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
+
+	cfg = get_one_free_irq_cfg(cpu);
+
+	if (!cfg)
+		return;
+
+	desc->chip_data = cfg;
+
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+	kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	struct irq_cfg *old_cfg, *cfg;
+
+	old_cfg = old_desc->chip_data;
+	cfg = desc->chip_data;
+
+	if (old_cfg == cfg)
+		return;
+
+	if (old_cfg) {
+		free_irq_2_pin(old_cfg, cfg);
+		free_irq_cfg(old_cfg);
+		old_desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		if (!cpus_intersects(desc->affinity, mask))
+			cfg->move_desc_pending = 1;
+	}
+}
+#endif
+
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
@@ -231,9 +349,11 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 
 #endif
 
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
 static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 {
 }
+#endif
 
 struct io_apic {
 	unsigned int index;
@@ -2346,14 +2466,34 @@ static void irq_complete_move(struct irq_desc **descp)
 	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+		if (likely(!cfg->move_desc_pending))
+			return;
+
+		/* domain is not change, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_pending = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index b5749db3e5a..36a01574678 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
 
 #endif
 
+static inline struct irq_desc *
+irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
+{
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+	return irq_to_desc(irq);
+#else
+	return desc;
+#endif
+}
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 681c52dbfe2..4dd5b1edac9 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8e4fce4a1b1..de210f4b7a9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -353,6 +353,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
 	spin_lock(&desc->lock);
 	mask_ack_irq(desc, irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -430,6 +431,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	desc->status &= ~IRQ_INPROGRESS;
 out:
 	desc->chip->eoi(irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -466,12 +468,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
 		mask_ack_irq(desc, irq);
+		desc = irq_remap_to_desc(irq, desc);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -532,8 +536,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi)
+	if (desc->chip->eoi) {
 		desc->chip->eoi(irq);
+		desc = irq_remap_to_desc(irq, desc);
+	}
 }
 
 void
@@ -568,8 +574,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
+		if (desc->chip != &no_irq_chip) {
 			mask_ack_irq(desc, irq);
+			desc = irq_remap_to_desc(irq, desc);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 8aa09547f5e..f1a23069c20 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -23,7 +23,7 @@
 /*
  * lockdep: we want to handle all irq_desc locks as a single lock-class:
  */
-static struct lock_class_key irq_desc_lock_class;
+struct lock_class_key irq_desc_lock_class;
 
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
@@ -73,7 +73,7 @@ static struct irq_desc irq_desc_init = {
 #endif
 };
 
-static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 {
 	unsigned long bytes;
 	char *ptr;
@@ -113,7 +113,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 /*
  * Protect the sparse_irqs:
  */
-static DEFINE_SPINLOCK(sparse_irq_lock);
+DEFINE_SPINLOCK(sparse_irq_lock);
 
 struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
 
@@ -337,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack)
+		if (desc->chip->ack) {
 			desc->chip->ack(irq);
+			/* get new one */
+			desc = irq_remap_to_desc(irq, desc);
+		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -349,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
 		desc->chip->ack(irq);
+		desc = irq_remap_to_desc(irq, desc);
+	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 64c1c7253da..e6d0a43cc12 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 
+extern struct lock_class_key irq_desc_lock_class;
+extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern spinlock_t sparse_irq_lock;
+extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644
index 00000000000..0178e229699
--- /dev/null
+++ b/kernel/irq/numa_migrate.c
@@ -0,0 +1,127 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+static void init_copy_kstat_irqs(struct irq_desc *old_desc,
+				 struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	if (desc->kstat_irqs != old_desc->kstat_irqs) {
+		/* Compute how many bytes we need per irq and allocate them */
+		bytes = nr * sizeof(unsigned int);
+
+		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+	}
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	if (old_desc->kstat_irqs == desc->kstat_irqs)
+		return;
+
+	kfree(old_desc->kstat_irqs);
+	old_desc->kstat_irqs = NULL;
+}
+
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	free_kstat_irqs(old_desc, desc);
+	arch_free_chip_data(old_desc, desc);
+}
+
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/* We have to check it to avoid races with another CPU */
+	desc = irq_desc_ptrs[irq];
+
+	if (desc && old_desc != desc)
+			goto out_unlock;
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
+		 irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not get new irq_desc for moving\n");
+		/* still use old one */
+		desc = old_desc;
+		goto out_unlock;
+	}
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	irq_desc_ptrs[irq] = desc;
+
+	/* free the old one */
+	free_one_irq_desc(old_desc, desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those all static, do move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG
+		 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+
-- 
cgit v1.2.3-70-g09d2


From b31a1d8b41513b96e9c7ec2f68c5734cef0b26a4 Mon Sep 17 00:00:00 2001
From: Andy Fleming <afleming@freescale.com>
Date: Tue, 16 Dec 2008 15:29:15 -0800
Subject: gianfar: Convert gianfar to an of_platform_driver

Does the same for the accompanying MDIO driver, and then modifies the TBI
configuration method.  The old way used fields in einfo, which no longer
exists.  The new way is to create an MDIO device-tree node for each instance
of gianfar, and create a tbi-handle property to associate ethernet controllers
with the TBI PHYs they are connected to.

Signed-off-by: Andy Fleming <afleming@freescale.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/powerpc/dts-bindings/fsl/tsec.txt |  12 +-
 arch/powerpc/boot/dts/asp834x-redboot.dts       |  20 ++
 arch/powerpc/boot/dts/ksi8560.dts               |  20 ++
 arch/powerpc/boot/dts/mpc8313erdb.dts           |  20 ++
 arch/powerpc/boot/dts/mpc8315erdb.dts           |  19 ++
 arch/powerpc/boot/dts/mpc8349emitx.dts          |  18 ++
 arch/powerpc/boot/dts/mpc8349emitxgp.dts        |   5 +
 arch/powerpc/boot/dts/mpc834x_mds.dts           |  19 ++
 arch/powerpc/boot/dts/mpc8377_mds.dts           |  19 ++
 arch/powerpc/boot/dts/mpc8377_rdb.dts           |  19 ++
 arch/powerpc/boot/dts/mpc8378_mds.dts           |  19 ++
 arch/powerpc/boot/dts/mpc8378_rdb.dts           |  17 ++
 arch/powerpc/boot/dts/mpc8379_mds.dts           |  18 ++
 arch/powerpc/boot/dts/mpc8379_rdb.dts           |  18 ++
 arch/powerpc/boot/dts/mpc8536ds.dts             |  18 ++
 arch/powerpc/boot/dts/mpc8540ads.dts            |  31 +++
 arch/powerpc/boot/dts/mpc8541cds.dts            |  18 ++
 arch/powerpc/boot/dts/mpc8544ds.dts             |  20 ++
 arch/powerpc/boot/dts/mpc8548cds.dts            |  44 ++++
 arch/powerpc/boot/dts/mpc8555cds.dts            |  18 ++
 arch/powerpc/boot/dts/mpc8560ads.dts            |  18 ++
 arch/powerpc/boot/dts/mpc8568mds.dts            |  18 ++
 arch/powerpc/boot/dts/mpc8572ds.dts             |  45 ++++
 arch/powerpc/boot/dts/mpc8641_hpcn.dts          |  45 ++++
 arch/powerpc/boot/dts/sbc8349.dts               |  18 ++
 arch/powerpc/boot/dts/sbc8548.dts               |  18 ++
 arch/powerpc/boot/dts/sbc8560.dts               |  18 ++
 arch/powerpc/boot/dts/sbc8641d.dts              |  44 ++++
 arch/powerpc/boot/dts/stx_gp3_8560.dts          |  18 ++
 arch/powerpc/boot/dts/tqm8540.dts               |  28 +++
 arch/powerpc/boot/dts/tqm8541.dts               |  18 ++
 arch/powerpc/boot/dts/tqm8548-bigflash.dts      |  44 ++++
 arch/powerpc/boot/dts/tqm8548.dts               |  44 ++++
 arch/powerpc/boot/dts/tqm8555.dts               |  18 ++
 arch/powerpc/boot/dts/tqm8560.dts               |  18 ++
 arch/powerpc/sysdev/fsl_soc.c                   | 241 +++---------------
 drivers/net/gianfar.c                           | 321 ++++++++++++++++--------
 drivers/net/gianfar.h                           |  21 +-
 drivers/net/gianfar_ethtool.c                   |  22 +-
 drivers/net/gianfar_mii.c                       | 212 +++++++++++-----
 drivers/net/gianfar_mii.h                       |   2 +
 include/linux/fsl_devices.h                     |  18 +-
 42 files changed, 1217 insertions(+), 424 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/powerpc/dts-bindings/fsl/tsec.txt b/Documentation/powerpc/dts-bindings/fsl/tsec.txt
index cf55fa4112d..7fa4b27574b 100644
--- a/Documentation/powerpc/dts-bindings/fsl/tsec.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/tsec.txt
@@ -2,8 +2,8 @@
 
 The MDIO is a bus to which the PHY devices are connected.  For each
 device that exists on this bus, a child node should be created.  See
-the definition of the PHY node below for an example of how to define
-a PHY.
+the definition of the PHY node in booting-without-of.txt for an example
+of how to define a PHY.
 
 Required properties:
   - reg : Offset and length of the register set for the device
@@ -21,6 +21,14 @@ Example:
 		};
 	};
 
+* TBI Internal MDIO bus
+
+As of this writing, every tsec is associated with an internal TBI PHY.
+This PHY is accessed through the local MDIO bus.  These buses are defined
+similarly to the mdio buses, except they are compatible with "fsl,gianfar-tbi".
+The TBI PHYs underneath them are similar to normal PHYs, but the reg property
+is considered instructive, rather than descriptive.  The reg property should
+be chosen so it doesn't interfere with other PHYs on the bus.
 
 * Gianfar-compatible ethernet nodes
 
diff --git a/arch/powerpc/boot/dts/asp834x-redboot.dts b/arch/powerpc/boot/dts/asp834x-redboot.dts
index 6235fca445d..524af7ef9f2 100644
--- a/arch/powerpc/boot/dts/asp834x-redboot.dts
+++ b/arch/powerpc/boot/dts/asp834x-redboot.dts
@@ -199,8 +199,26 @@
 				reg = <0x2>;
 				device_type = "ethernet-phy";
 			};
+
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -210,6 +228,7 @@
 			local-mac-address = [ 00 08 e5 11 32 33 ];
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 			linux,network-index = <0>;
 		};
@@ -223,6 +242,7 @@
 			local-mac-address = [ 00 08 e5 11 32 34 ];
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 			linux,network-index = <1>;
 		};
diff --git a/arch/powerpc/boot/dts/ksi8560.dts b/arch/powerpc/boot/dts/ksi8560.dts
index 49737589ffc..3bfff47418d 100644
--- a/arch/powerpc/boot/dts/ksi8560.dts
+++ b/arch/powerpc/boot/dts/ksi8560.dts
@@ -141,8 +141,26 @@
 				reg = <0x2>;
 				device_type = "ethernet-phy";
 			};
+
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+
 		enet0: ethernet@24000 {
 			device_type = "network";
 			model = "TSEC";
@@ -152,6 +170,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&PHY1>;
 		};
 
@@ -164,6 +183,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&PHY2>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8313erdb.dts b/arch/powerpc/boot/dts/mpc8313erdb.dts
index 50303176682..d4df8b6857a 100644
--- a/arch/powerpc/boot/dts/mpc8313erdb.dts
+++ b/arch/powerpc/boot/dts/mpc8313erdb.dts
@@ -190,6 +190,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 0x8 36 0x8 35 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = < &tbi0 >;
 			phy-handle = < &phy1 >;
 			fsl,magic-packet;
 
@@ -210,6 +211,10 @@
 					reg = <0x4>;
 					device_type = "ethernet-phy";
 				};
+				tbi0: tbi-phy@11 {
+					reg = <0x11>;
+					device_type = "tbi-phy";
+				};
 			};
 		};
 
@@ -222,9 +227,24 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <34 0x8 33 0x8 32 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = < &tbi1 >;
 			phy-handle = < &phy4 >;
 			sleep = <&pmc 0x10000000>;
 			fsl,magic-packet;
+
+			mdio@25520 {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				compatible = "fsl,gianfar-tbi";
+				reg = <0x25520 0x20>;
+
+				tbi1: tbi-phy@11 {
+					reg = <0x11>;
+					device_type = "tbi-phy";
+				};
+			};
+
+
 		};
 
 		serial0: serial@4500 {
diff --git a/arch/powerpc/boot/dts/mpc8315erdb.dts b/arch/powerpc/boot/dts/mpc8315erdb.dts
index 6b850670de1..d2cdd47a246 100644
--- a/arch/powerpc/boot/dts/mpc8315erdb.dts
+++ b/arch/powerpc/boot/dts/mpc8315erdb.dts
@@ -206,8 +206,25 @@
 				reg = <0x1>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -217,6 +234,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = < &phy0 >;
 		};
 
@@ -229,6 +247,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = < &phy1 >;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8349emitx.dts b/arch/powerpc/boot/dts/mpc8349emitx.dts
index 4bdbaf4993a..712783d0707 100644
--- a/arch/powerpc/boot/dts/mpc8349emitx.dts
+++ b/arch/powerpc/boot/dts/mpc8349emitx.dts
@@ -184,6 +184,22 @@
 				reg = <0x1c>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -195,6 +211,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy1c>;
 			linux,network-index = <0>;
 		};
@@ -211,6 +228,7 @@
 			/* Vitesse 7385 isn't on the MDIO bus */
 			fixed-link = <1 1 1000 0 0>;
 			linux,network-index = <1>;
+			tbi-handle = <&tbi1>;
 		};
 
 		serial0: serial@4500 {
diff --git a/arch/powerpc/boot/dts/mpc8349emitxgp.dts b/arch/powerpc/boot/dts/mpc8349emitxgp.dts
index fa40647ee62..3e918af41fb 100644
--- a/arch/powerpc/boot/dts/mpc8349emitxgp.dts
+++ b/arch/powerpc/boot/dts/mpc8349emitxgp.dts
@@ -163,6 +163,10 @@
 				reg = <0x1c>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -174,6 +178,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy1c>;
 			linux,network-index = <0>;
 		};
diff --git a/arch/powerpc/boot/dts/mpc834x_mds.dts b/arch/powerpc/boot/dts/mpc834x_mds.dts
index c986c541e9b..d9adba01c09 100644
--- a/arch/powerpc/boot/dts/mpc834x_mds.dts
+++ b/arch/powerpc/boot/dts/mpc834x_mds.dts
@@ -185,8 +185,25 @@
 				reg = <0x1>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -196,6 +213,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 			linux,network-index = <0>;
 		};
@@ -209,6 +227,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 			linux,network-index = <1>;
 		};
diff --git a/arch/powerpc/boot/dts/mpc8377_mds.dts b/arch/powerpc/boot/dts/mpc8377_mds.dts
index 0484561bd2c..1d14d7052e6 100644
--- a/arch/powerpc/boot/dts/mpc8377_mds.dts
+++ b/arch/powerpc/boot/dts/mpc8377_mds.dts
@@ -193,8 +193,25 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -205,6 +222,7 @@
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -218,6 +236,7 @@
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy3>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8377_rdb.dts b/arch/powerpc/boot/dts/mpc8377_rdb.dts
index 435ef3dd022..31f348fdfe1 100644
--- a/arch/powerpc/boot/dts/mpc8377_rdb.dts
+++ b/arch/powerpc/boot/dts/mpc8377_rdb.dts
@@ -211,8 +211,25 @@
 				reg = <0x2>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -223,6 +240,7 @@
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -237,6 +255,7 @@
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
 			fixed-link = <1 1 1000 0 0>;
+			tbi-handle = <&tbi1>;
 		};
 
 		serial0: serial@4500 {
diff --git a/arch/powerpc/boot/dts/mpc8378_mds.dts b/arch/powerpc/boot/dts/mpc8378_mds.dts
index 67a08d2e2ff..b85fc02682d 100644
--- a/arch/powerpc/boot/dts/mpc8378_mds.dts
+++ b/arch/powerpc/boot/dts/mpc8378_mds.dts
@@ -232,8 +232,25 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -244,6 +261,7 @@
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -257,6 +275,7 @@
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy3>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8378_rdb.dts b/arch/powerpc/boot/dts/mpc8378_rdb.dts
index b11e68f56a0..7a2bad038bd 100644
--- a/arch/powerpc/boot/dts/mpc8378_rdb.dts
+++ b/arch/powerpc/boot/dts/mpc8378_rdb.dts
@@ -211,8 +211,25 @@
 				reg = <0x2>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
diff --git a/arch/powerpc/boot/dts/mpc8379_mds.dts b/arch/powerpc/boot/dts/mpc8379_mds.dts
index 323370a2b5f..acf06c438db 100644
--- a/arch/powerpc/boot/dts/mpc8379_mds.dts
+++ b/arch/powerpc/boot/dts/mpc8379_mds.dts
@@ -232,6 +232,22 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -244,6 +260,7 @@
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -257,6 +274,7 @@
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy3>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8379_rdb.dts b/arch/powerpc/boot/dts/mpc8379_rdb.dts
index 337af6ea26d..e067616f3f4 100644
--- a/arch/powerpc/boot/dts/mpc8379_rdb.dts
+++ b/arch/powerpc/boot/dts/mpc8379_rdb.dts
@@ -211,6 +211,22 @@
 				reg = <0x2>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -223,6 +239,7 @@
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -237,6 +254,7 @@
 			phy-connection-type = "mii";
 			interrupt-parent = <&ipic>;
 			fixed-link = <1 1 1000 0 0>;
+			tbi-handle = <&tbi1>;
 		};
 
 		serial0: serial@4500 {
diff --git a/arch/powerpc/boot/dts/mpc8536ds.dts b/arch/powerpc/boot/dts/mpc8536ds.dts
index 35db1e5440c..3c905df1812 100644
--- a/arch/powerpc/boot/dts/mpc8536ds.dts
+++ b/arch/powerpc/boot/dts/mpc8536ds.dts
@@ -155,6 +155,22 @@
 				reg = <1>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		usb@22000 {
@@ -186,6 +202,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy1>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -199,6 +216,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy0>;
 			phy-connection-type = "rgmii-id";
 		};
diff --git a/arch/powerpc/boot/dts/mpc8540ads.dts b/arch/powerpc/boot/dts/mpc8540ads.dts
index 9568bfaff8f..79570ffe41b 100644
--- a/arch/powerpc/boot/dts/mpc8540ads.dts
+++ b/arch/powerpc/boot/dts/mpc8540ads.dts
@@ -150,6 +150,34 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -161,6 +189,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -173,6 +202,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
@@ -185,6 +215,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <41 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy3>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8541cds.dts b/arch/powerpc/boot/dts/mpc8541cds.dts
index 6480f4fd96e..221036a8ce2 100644
--- a/arch/powerpc/boot/dts/mpc8541cds.dts
+++ b/arch/powerpc/boot/dts/mpc8541cds.dts
@@ -144,6 +144,22 @@
 				reg = <0x1>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -155,6 +171,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -167,6 +184,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8544ds.dts b/arch/powerpc/boot/dts/mpc8544ds.dts
index f1fb20737e3..b9da4210506 100644
--- a/arch/powerpc/boot/dts/mpc8544ds.dts
+++ b/arch/powerpc/boot/dts/mpc8544ds.dts
@@ -116,8 +116,26 @@
 				reg = <0x1>;
 				device_type = "ethernet-phy";
 			};
+
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+
 		dma@21300 {
 			#address-cells = <1>;
 			#size-cells = <1>;
@@ -169,6 +187,7 @@
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
 			phy-handle = <&phy0>;
+			tbi-handle = <&tbi0>;
 			phy-connection-type = "rgmii-id";
 		};
 
@@ -182,6 +201,7 @@
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
 			phy-handle = <&phy1>;
+			tbi-handle = <&tbi1>;
 			phy-connection-type = "rgmii-id";
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8548cds.dts b/arch/powerpc/boot/dts/mpc8548cds.dts
index 431b496270d..df774a7088f 100644
--- a/arch/powerpc/boot/dts/mpc8548cds.dts
+++ b/arch/powerpc/boot/dts/mpc8548cds.dts
@@ -172,6 +172,46 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@27520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x27520 0x20>;
+
+			tbi3: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -183,6 +223,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -195,6 +236,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
@@ -208,6 +250,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy2>;
 		};
 
@@ -220,6 +263,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 2 38 2 39 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi3>;
 			phy-handle = <&phy3>;
 		};
  */
diff --git a/arch/powerpc/boot/dts/mpc8555cds.dts b/arch/powerpc/boot/dts/mpc8555cds.dts
index d833a5c4f47..053b01e1c93 100644
--- a/arch/powerpc/boot/dts/mpc8555cds.dts
+++ b/arch/powerpc/boot/dts/mpc8555cds.dts
@@ -144,6 +144,22 @@
 				reg = <0x1>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -155,6 +171,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -167,6 +184,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8560ads.dts b/arch/powerpc/boot/dts/mpc8560ads.dts
index 4d1f2f28409..11b1bcbe14c 100644
--- a/arch/powerpc/boot/dts/mpc8560ads.dts
+++ b/arch/powerpc/boot/dts/mpc8560ads.dts
@@ -145,6 +145,22 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -156,6 +172,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -168,6 +185,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8568mds.dts b/arch/powerpc/boot/dts/mpc8568mds.dts
index c80158f7741..1955bd9e113 100644
--- a/arch/powerpc/boot/dts/mpc8568mds.dts
+++ b/arch/powerpc/boot/dts/mpc8568mds.dts
@@ -179,6 +179,22 @@
 				reg = <0x3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -190,6 +206,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
  			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -202,6 +219,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
  			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy3>;
 		};
 
diff --git a/arch/powerpc/boot/dts/mpc8572ds.dts b/arch/powerpc/boot/dts/mpc8572ds.dts
index 5c69b2fafd3..05f67253b49 100644
--- a/arch/powerpc/boot/dts/mpc8572ds.dts
+++ b/arch/powerpc/boot/dts/mpc8572ds.dts
@@ -225,6 +225,47 @@
 				interrupts = <10 1>;
 				reg = <0x3>;
 			};
+
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@27520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x27520 0x20>;
+
+			tbi3: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -236,6 +277,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -249,6 +291,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -262,6 +305,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy2>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -275,6 +319,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 2 38 2 39 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi3>;
 			phy-handle = <&phy3>;
 			phy-connection-type = "rgmii-id";
 		};
diff --git a/arch/powerpc/boot/dts/mpc8641_hpcn.dts b/arch/powerpc/boot/dts/mpc8641_hpcn.dts
index d665e767822..35d5e248ccd 100644
--- a/arch/powerpc/boot/dts/mpc8641_hpcn.dts
+++ b/arch/powerpc/boot/dts/mpc8641_hpcn.dts
@@ -205,8 +205,49 @@
 				reg = <3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@27520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x27520 0x20>;
+
+			tbi3: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
+
 		enet0: ethernet@24000 {
 			cell-index = <0>;
 			device_type = "network";
@@ -216,6 +257,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30  2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -229,6 +271,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -242,6 +285,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy2>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -255,6 +299,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 2 38 2 39 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi3>;
 			phy-handle = <&phy3>;
 			phy-connection-type = "rgmii-id";
 		};
diff --git a/arch/powerpc/boot/dts/sbc8349.dts b/arch/powerpc/boot/dts/sbc8349.dts
index 0f941f310e4..8d365a57ebc 100644
--- a/arch/powerpc/boot/dts/sbc8349.dts
+++ b/arch/powerpc/boot/dts/sbc8349.dts
@@ -177,6 +177,22 @@
 				reg = <0x1a>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -188,6 +204,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <32 0x8 33 0x8 34 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 			linux,network-index = <0>;
 		};
@@ -201,6 +218,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 0x8 36 0x8 37 0x8>;
 			interrupt-parent = <&ipic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 			linux,network-index = <1>;
 		};
diff --git a/arch/powerpc/boot/dts/sbc8548.dts b/arch/powerpc/boot/dts/sbc8548.dts
index 333552b4e90..2baf4a51f22 100644
--- a/arch/powerpc/boot/dts/sbc8548.dts
+++ b/arch/powerpc/boot/dts/sbc8548.dts
@@ -252,6 +252,22 @@
 				reg = <0x1a>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -263,6 +279,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -275,6 +292,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/sbc8560.dts b/arch/powerpc/boot/dts/sbc8560.dts
index db3632ef988..01542f7062a 100644
--- a/arch/powerpc/boot/dts/sbc8560.dts
+++ b/arch/powerpc/boot/dts/sbc8560.dts
@@ -168,6 +168,22 @@
 				reg = <0x1c>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -179,6 +195,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 		};
 
@@ -191,6 +208,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/sbc8641d.dts b/arch/powerpc/boot/dts/sbc8641d.dts
index 9652456158f..36db981548e 100644
--- a/arch/powerpc/boot/dts/sbc8641d.dts
+++ b/arch/powerpc/boot/dts/sbc8641d.dts
@@ -222,6 +222,46 @@
 				reg = <2>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@27520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x27520 0x20>;
+
+			tbi3: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -233,6 +273,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30  2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy0>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -246,6 +287,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -259,6 +301,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy2>;
 			phy-connection-type = "rgmii-id";
 		};
@@ -272,6 +315,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 2 38 2 39 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi3>;
 			phy-handle = <&phy3>;
 			phy-connection-type = "rgmii-id";
 		};
diff --git a/arch/powerpc/boot/dts/stx_gp3_8560.dts b/arch/powerpc/boot/dts/stx_gp3_8560.dts
index fcd1db6ca0a..fff33fe6efc 100644
--- a/arch/powerpc/boot/dts/stx_gp3_8560.dts
+++ b/arch/powerpc/boot/dts/stx_gp3_8560.dts
@@ -142,6 +142,22 @@
 				reg = <4>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -153,6 +169,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -165,6 +182,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy4>;
 		};
 
diff --git a/arch/powerpc/boot/dts/tqm8540.dts b/arch/powerpc/boot/dts/tqm8540.dts
index e1d260b9085..a693f01c21a 100644
--- a/arch/powerpc/boot/dts/tqm8540.dts
+++ b/arch/powerpc/boot/dts/tqm8540.dts
@@ -155,6 +155,34 @@
 				reg = <3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
diff --git a/arch/powerpc/boot/dts/tqm8541.dts b/arch/powerpc/boot/dts/tqm8541.dts
index d76441ec5dc..9e3f5f0dde2 100644
--- a/arch/powerpc/boot/dts/tqm8541.dts
+++ b/arch/powerpc/boot/dts/tqm8541.dts
@@ -154,6 +154,22 @@
 				reg = <3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -165,6 +181,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -177,6 +194,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/tqm8548-bigflash.dts b/arch/powerpc/boot/dts/tqm8548-bigflash.dts
index 4199e89b4e5..15086eb65c5 100644
--- a/arch/powerpc/boot/dts/tqm8548-bigflash.dts
+++ b/arch/powerpc/boot/dts/tqm8548-bigflash.dts
@@ -179,6 +179,46 @@
 				reg = <5>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@27520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x27520 0x20>;
+
+			tbi3: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -190,6 +230,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -202,6 +243,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
@@ -214,6 +256,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy3>;
 		};
 
@@ -226,6 +269,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 2 38 2 39 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi3>;
 			phy-handle = <&phy4>;
 		};
 
diff --git a/arch/powerpc/boot/dts/tqm8548.dts b/arch/powerpc/boot/dts/tqm8548.dts
index 58ee4185454..b7b65f5e79b 100644
--- a/arch/powerpc/boot/dts/tqm8548.dts
+++ b/arch/powerpc/boot/dts/tqm8548.dts
@@ -179,6 +179,46 @@
 				reg = <5>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@26520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x26520 0x20>;
+
+			tbi2: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@27520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x27520 0x20>;
+
+			tbi3: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -190,6 +230,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -202,6 +243,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
@@ -214,6 +256,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <31 2 32 2 33 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi2>;
 			phy-handle = <&phy3>;
 		};
 
@@ -226,6 +269,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <37 2 38 2 39 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi3>;
 			phy-handle = <&phy4>;
 		};
 
diff --git a/arch/powerpc/boot/dts/tqm8555.dts b/arch/powerpc/boot/dts/tqm8555.dts
index 6f7ea59c484..cf92b4e7945 100644
--- a/arch/powerpc/boot/dts/tqm8555.dts
+++ b/arch/powerpc/boot/dts/tqm8555.dts
@@ -154,6 +154,22 @@
 				reg = <3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -165,6 +181,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -177,6 +194,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/boot/dts/tqm8560.dts b/arch/powerpc/boot/dts/tqm8560.dts
index 3fe35208907..9e1ab2d2f66 100644
--- a/arch/powerpc/boot/dts/tqm8560.dts
+++ b/arch/powerpc/boot/dts/tqm8560.dts
@@ -156,6 +156,22 @@
 				reg = <3>;
 				device_type = "ethernet-phy";
 			};
+			tbi0: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
+		};
+
+		mdio@25520 {
+			#address-cells = <1>;
+			#size-cells = <0>;
+			compatible = "fsl,gianfar-tbi";
+			reg = <0x25520 0x20>;
+
+			tbi1: tbi-phy@11 {
+				reg = <0x11>;
+				device_type = "tbi-phy";
+			};
 		};
 
 		enet0: ethernet@24000 {
@@ -167,6 +183,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <29 2 30 2 34 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi0>;
 			phy-handle = <&phy2>;
 		};
 
@@ -179,6 +196,7 @@
 			local-mac-address = [ 00 00 00 00 00 00 ];
 			interrupts = <35 2 36 2 40 2>;
 			interrupt-parent = <&mpic>;
+			tbi-handle = <&tbi1>;
 			phy-handle = <&phy1>;
 		};
 
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index 26ecb96f973..115cb16351f 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -207,236 +207,51 @@ static int __init of_add_fixed_phys(void)
 arch_initcall(of_add_fixed_phys);
 #endif /* CONFIG_FIXED_PHY */
 
-static int gfar_mdio_of_init_one(struct device_node *np)
-{
-	int k;
-	struct device_node *child = NULL;
-	struct gianfar_mdio_data mdio_data;
-	struct platform_device *mdio_dev;
-	struct resource res;
-	int ret;
-
-	memset(&res, 0, sizeof(res));
-	memset(&mdio_data, 0, sizeof(mdio_data));
-
-	ret = of_address_to_resource(np, 0, &res);
-	if (ret)
-		return ret;
-
-	/* The gianfar device will try to use the same ID created below to find
-	 * this bus, to coordinate register access (since they share).  */
-	mdio_dev = platform_device_register_simple("fsl-gianfar_mdio",
-			res.start&0xfffff, &res, 1);
-	if (IS_ERR(mdio_dev))
-		return PTR_ERR(mdio_dev);
-
-	for (k = 0; k < 32; k++)
-		mdio_data.irq[k] = PHY_POLL;
-
-	while ((child = of_get_next_child(np, child)) != NULL) {
-		int irq = irq_of_parse_and_map(child, 0);
-		if (irq != NO_IRQ) {
-			const u32 *id = of_get_property(child, "reg", NULL);
-			mdio_data.irq[*id] = irq;
-		}
-	}
-
-	ret = platform_device_add_data(mdio_dev, &mdio_data,
-				sizeof(struct gianfar_mdio_data));
-	if (ret)
-		platform_device_unregister(mdio_dev);
-
-	return ret;
-}
-
-static int __init gfar_mdio_of_init(void)
-{
-	struct device_node *np = NULL;
-
-	for_each_compatible_node(np, NULL, "fsl,gianfar-mdio")
-		gfar_mdio_of_init_one(np);
-
-	/* try the deprecated version */
-	for_each_compatible_node(np, "mdio", "gianfar");
-		gfar_mdio_of_init_one(np);
-
-	return 0;
-}
-
-arch_initcall(gfar_mdio_of_init);
-
-static const char *gfar_tx_intr = "tx";
-static const char *gfar_rx_intr = "rx";
-static const char *gfar_err_intr = "error";
-
-static int __init gfar_of_init(void)
+#ifdef CONFIG_PPC_83xx
+static int __init mpc83xx_wdt_init(void)
 {
+	struct resource r;
 	struct device_node *np;
-	unsigned int i;
-	struct platform_device *gfar_dev;
-	struct resource res;
+	struct platform_device *dev;
+	u32 freq = fsl_get_sys_freq();
 	int ret;
 
-	for (np = NULL, i = 0;
-	     (np = of_find_compatible_node(np, "network", "gianfar")) != NULL;
-	     i++) {
-		struct resource r[4];
-		struct device_node *phy, *mdio;
-		struct gianfar_platform_data gfar_data;
-		const unsigned int *id;
-		const char *model;
-		const char *ctype;
-		const void *mac_addr;
-		const phandle *ph;
-		int n_res = 2;
-
-		if (!of_device_is_available(np))
-			continue;
-
-		memset(r, 0, sizeof(r));
-		memset(&gfar_data, 0, sizeof(gfar_data));
-
-		ret = of_address_to_resource(np, 0, &r[0]);
-		if (ret)
-			goto err;
-
-		of_irq_to_resource(np, 0, &r[1]);
-
-		model = of_get_property(np, "model", NULL);
-
-		/* If we aren't the FEC we have multiple interrupts */
-		if (model && strcasecmp(model, "FEC")) {
-			r[1].name = gfar_tx_intr;
-
-			r[2].name = gfar_rx_intr;
-			of_irq_to_resource(np, 1, &r[2]);
+	np = of_find_compatible_node(NULL, "watchdog", "mpc83xx_wdt");
 
-			r[3].name = gfar_err_intr;
-			of_irq_to_resource(np, 2, &r[3]);
-
-			n_res += 2;
-		}
-
-		gfar_dev =
-		    platform_device_register_simple("fsl-gianfar", i, &r[0],
-						    n_res);
-
-		if (IS_ERR(gfar_dev)) {
-			ret = PTR_ERR(gfar_dev);
-			goto err;
-		}
-
-		mac_addr = of_get_mac_address(np);
-		if (mac_addr)
-			memcpy(gfar_data.mac_addr, mac_addr, 6);
-
-		if (model && !strcasecmp(model, "TSEC"))
-			gfar_data.device_flags =
-			    FSL_GIANFAR_DEV_HAS_GIGABIT |
-			    FSL_GIANFAR_DEV_HAS_COALESCE |
-			    FSL_GIANFAR_DEV_HAS_RMON |
-			    FSL_GIANFAR_DEV_HAS_MULTI_INTR;
-		if (model && !strcasecmp(model, "eTSEC"))
-			gfar_data.device_flags =
-			    FSL_GIANFAR_DEV_HAS_GIGABIT |
-			    FSL_GIANFAR_DEV_HAS_COALESCE |
-			    FSL_GIANFAR_DEV_HAS_RMON |
-			    FSL_GIANFAR_DEV_HAS_MULTI_INTR |
-			    FSL_GIANFAR_DEV_HAS_CSUM |
-			    FSL_GIANFAR_DEV_HAS_VLAN |
-			    FSL_GIANFAR_DEV_HAS_EXTENDED_HASH;
-
-		ctype = of_get_property(np, "phy-connection-type", NULL);
-
-		/* We only care about rgmii-id.  The rest are autodetected */
-		if (ctype && !strcmp(ctype, "rgmii-id"))
-			gfar_data.interface = PHY_INTERFACE_MODE_RGMII_ID;
-		else
-			gfar_data.interface = PHY_INTERFACE_MODE_MII;
-
-		if (of_get_property(np, "fsl,magic-packet", NULL))
-			gfar_data.device_flags |= FSL_GIANFAR_DEV_HAS_MAGIC_PACKET;
-
-		ph = of_get_property(np, "phy-handle", NULL);
-		if (ph == NULL) {
-			u32 *fixed_link;
-
-			fixed_link = (u32 *)of_get_property(np, "fixed-link",
-							   NULL);
-			if (!fixed_link) {
-				ret = -ENODEV;
-				goto unreg;
-			}
-
-			snprintf(gfar_data.bus_id, MII_BUS_ID_SIZE, "0");
-			gfar_data.phy_id = fixed_link[0];
-		} else {
-			phy = of_find_node_by_phandle(*ph);
-
-			if (phy == NULL) {
-				ret = -ENODEV;
-				goto unreg;
-			}
-
-			mdio = of_get_parent(phy);
-
-			id = of_get_property(phy, "reg", NULL);
-			ret = of_address_to_resource(mdio, 0, &res);
-			if (ret) {
-				of_node_put(phy);
-				of_node_put(mdio);
-				goto unreg;
-			}
-
-			gfar_data.phy_id = *id;
-			snprintf(gfar_data.bus_id, MII_BUS_ID_SIZE, "%llx",
-				 (unsigned long long)res.start&0xfffff);
+	if (!np) {
+		ret = -ENODEV;
+		goto nodev;
+	}
 
-			of_node_put(phy);
-			of_node_put(mdio);
-		}
+	memset(&r, 0, sizeof(r));
 
-		/* Get MDIO bus controlled by this eTSEC, if any.  Normally only
-		 * eTSEC 1 will control an MDIO bus, not necessarily the same
-		 * bus that its PHY is on ('mdio' above), so we can't just use
-		 * that.  What we do is look for a gianfar mdio device that has
-		 * overlapping registers with this device.  That's really the
-		 * whole point, to find the device sharing our registers to
-		 * coordinate access with it.
-		 */
-		for_each_compatible_node(mdio, NULL, "fsl,gianfar-mdio") {
-			if (of_address_to_resource(mdio, 0, &res))
-				continue;
-
-			if (res.start >= r[0].start && res.end <= r[0].end) {
-				/* Get the ID the mdio bus platform device was
-				 * registered with.  gfar_data.bus_id is
-				 * different because it's for finding a PHY,
-				 * while this is for finding a MII bus.
-				 */
-				gfar_data.mdio_bus = res.start&0xfffff;
-				of_node_put(mdio);
-				break;
-			}
-		}
+	ret = of_address_to_resource(np, 0, &r);
+	if (ret)
+		goto err;
 
-		ret =
-		    platform_device_add_data(gfar_dev, &gfar_data,
-					     sizeof(struct
-						    gianfar_platform_data));
-		if (ret)
-			goto unreg;
+	dev = platform_device_register_simple("mpc83xx_wdt", 0, &r, 1);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto err;
 	}
 
+	ret = platform_device_add_data(dev, &freq, sizeof(freq));
+	if (ret)
+		goto unreg;
+
+	of_node_put(np);
 	return 0;
 
 unreg:
-	platform_device_unregister(gfar_dev);
+	platform_device_unregister(dev);
 err:
+	of_node_put(np);
+nodev:
 	return ret;
 }
 
-arch_initcall(gfar_of_init);
+arch_initcall(mpc83xx_wdt_init);
+#endif
 
 static enum fsl_usb2_phy_modes determine_usb_phy(const char *phy_type)
 {
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 55e319fa7fe..7398704c4b5 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -25,11 +25,8 @@
  *
  *  Theory of operation
  *
- *  The driver is initialized through platform_device.  Structures which
- *  define the configuration needed by the board are defined in a
- *  board structure in arch/ppc/platforms (though I do not
- *  discount the possibility that other architectures could one
- *  day be supported.
+ *  The driver is initialized through of_device. Configuration information
+ *  is therefore conveyed through an OF-style device tree.
  *
  *  The Gianfar Ethernet Controller uses a ring of buffer
  *  descriptors.  The beginning is indicated by a register
@@ -78,7 +75,7 @@
 #include <linux/if_vlan.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
-#include <linux/platform_device.h>
+#include <linux/of_platform.h>
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
@@ -92,6 +89,8 @@
 #include <linux/crc32.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/of.h>
 
 #include "gianfar.h"
 #include "gianfar_mii.h"
@@ -119,8 +118,9 @@ static irqreturn_t gfar_interrupt(int irq, void *dev_id);
 static void adjust_link(struct net_device *dev);
 static void init_registers(struct net_device *dev);
 static int init_phy(struct net_device *dev);
-static int gfar_probe(struct platform_device *pdev);
-static int gfar_remove(struct platform_device *pdev);
+static int gfar_probe(struct of_device *ofdev,
+		const struct of_device_id *match);
+static int gfar_remove(struct of_device *ofdev);
 static void free_skb_resources(struct gfar_private *priv);
 static void gfar_set_multi(struct net_device *dev);
 static void gfar_set_hash_for_addr(struct net_device *dev, u8 *addr);
@@ -152,25 +152,158 @@ static inline int gfar_uses_fcb(struct gfar_private *priv)
 	return (priv->vlan_enable || priv->rx_csum_enable);
 }
 
+static int gfar_of_init(struct net_device *dev)
+{
+	struct device_node *phy, *mdio;
+	const unsigned int *id;
+	const char *model;
+	const char *ctype;
+	const void *mac_addr;
+	const phandle *ph;
+	u64 addr, size;
+	int err = 0;
+	struct gfar_private *priv = netdev_priv(dev);
+	struct device_node *np = priv->node;
+	char bus_name[MII_BUS_ID_SIZE];
+
+	if (!np || !of_device_is_available(np))
+		return -ENODEV;
+
+	/* get a pointer to the register memory */
+	addr = of_translate_address(np, of_get_address(np, 0, &size, NULL));
+	priv->regs = ioremap(addr, size);
+
+	if (priv->regs == NULL)
+		return -ENOMEM;
+
+	priv->interruptTransmit = irq_of_parse_and_map(np, 0);
+
+	model = of_get_property(np, "model", NULL);
+
+	/* If we aren't the FEC we have multiple interrupts */
+	if (model && strcasecmp(model, "FEC")) {
+		priv->interruptReceive = irq_of_parse_and_map(np, 1);
+
+		priv->interruptError = irq_of_parse_and_map(np, 2);
+
+		if (priv->interruptTransmit < 0 ||
+				priv->interruptReceive < 0 ||
+				priv->interruptError < 0) {
+			err = -EINVAL;
+			goto err_out;
+		}
+	}
+
+	mac_addr = of_get_mac_address(np);
+	if (mac_addr)
+		memcpy(dev->dev_addr, mac_addr, MAC_ADDR_LEN);
+
+	if (model && !strcasecmp(model, "TSEC"))
+		priv->device_flags =
+			FSL_GIANFAR_DEV_HAS_GIGABIT |
+			FSL_GIANFAR_DEV_HAS_COALESCE |
+			FSL_GIANFAR_DEV_HAS_RMON |
+			FSL_GIANFAR_DEV_HAS_MULTI_INTR;
+	if (model && !strcasecmp(model, "eTSEC"))
+		priv->device_flags =
+			FSL_GIANFAR_DEV_HAS_GIGABIT |
+			FSL_GIANFAR_DEV_HAS_COALESCE |
+			FSL_GIANFAR_DEV_HAS_RMON |
+			FSL_GIANFAR_DEV_HAS_MULTI_INTR |
+			FSL_GIANFAR_DEV_HAS_CSUM |
+			FSL_GIANFAR_DEV_HAS_VLAN |
+			FSL_GIANFAR_DEV_HAS_MAGIC_PACKET |
+			FSL_GIANFAR_DEV_HAS_EXTENDED_HASH;
+
+	ctype = of_get_property(np, "phy-connection-type", NULL);
+
+	/* We only care about rgmii-id.  The rest are autodetected */
+	if (ctype && !strcmp(ctype, "rgmii-id"))
+		priv->interface = PHY_INTERFACE_MODE_RGMII_ID;
+	else
+		priv->interface = PHY_INTERFACE_MODE_MII;
+
+	if (of_get_property(np, "fsl,magic-packet", NULL))
+		priv->device_flags |= FSL_GIANFAR_DEV_HAS_MAGIC_PACKET;
+
+	ph = of_get_property(np, "phy-handle", NULL);
+	if (ph == NULL) {
+		u32 *fixed_link;
+
+		fixed_link = (u32 *)of_get_property(np, "fixed-link", NULL);
+		if (!fixed_link) {
+			err = -ENODEV;
+			goto err_out;
+		}
+
+		snprintf(priv->phy_bus_id, BUS_ID_SIZE, PHY_ID_FMT, "0",
+				fixed_link[0]);
+	} else {
+		phy = of_find_node_by_phandle(*ph);
+
+		if (phy == NULL) {
+			err = -ENODEV;
+			goto err_out;
+		}
+
+		mdio = of_get_parent(phy);
+
+		id = of_get_property(phy, "reg", NULL);
+
+		of_node_put(phy);
+		of_node_put(mdio);
+
+		gfar_mdio_bus_name(bus_name, mdio);
+		snprintf(priv->phy_bus_id, BUS_ID_SIZE, "%s:%02x",
+				bus_name, *id);
+	}
+
+	/* Find the TBI PHY.  If it's not there, we don't support SGMII */
+	ph = of_get_property(np, "tbi-handle", NULL);
+	if (ph) {
+		struct device_node *tbi = of_find_node_by_phandle(*ph);
+		struct of_device *ofdev;
+		struct mii_bus *bus;
+
+		if (!tbi)
+			return 0;
+
+		mdio = of_get_parent(tbi);
+		if (!mdio)
+			return 0;
+
+		ofdev = of_find_device_by_node(mdio);
+
+		of_node_put(mdio);
+
+		id = of_get_property(tbi, "reg", NULL);
+		if (!id)
+			return 0;
+
+		of_node_put(tbi);
+
+		bus = dev_get_drvdata(&ofdev->dev);
+
+		priv->tbiphy = bus->phy_map[*id];
+	}
+
+	return 0;
+
+err_out:
+	iounmap(priv->regs);
+	return err;
+}
+
 /* Set up the ethernet device structure, private data,
  * and anything else we need before we start */
-static int gfar_probe(struct platform_device *pdev)
+static int gfar_probe(struct of_device *ofdev,
+		const struct of_device_id *match)
 {
 	u32 tempval;
 	struct net_device *dev = NULL;
 	struct gfar_private *priv = NULL;
-	struct gianfar_platform_data *einfo;
-	struct resource *r;
-	int err = 0, irq;
-
-	einfo = (struct gianfar_platform_data *) pdev->dev.platform_data;
-
-	if (NULL == einfo) {
-		printk(KERN_ERR "gfar %d: Missing additional data!\n",
-		       pdev->id);
-
-		return -ENODEV;
-	}
+	int err = 0;
+	DECLARE_MAC_BUF(mac);
 
 	/* Create an ethernet device instance */
 	dev = alloc_etherdev(sizeof (*priv));
@@ -180,48 +313,19 @@ static int gfar_probe(struct platform_device *pdev)
 
 	priv = netdev_priv(dev);
 	priv->dev = dev;
+	priv->node = ofdev->node;
 
-	/* Set the info in the priv to the current info */
-	priv->einfo = einfo;
-
-	/* fill out IRQ fields */
-	if (einfo->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
-		irq = platform_get_irq_byname(pdev, "tx");
-		if (irq < 0)
-			goto regs_fail;
-		priv->interruptTransmit = irq;
-
-		irq = platform_get_irq_byname(pdev, "rx");
-		if (irq < 0)
-			goto regs_fail;
-		priv->interruptReceive = irq;
-
-		irq = platform_get_irq_byname(pdev, "error");
-		if (irq < 0)
-			goto regs_fail;
-		priv->interruptError = irq;
-	} else {
-		irq = platform_get_irq(pdev, 0);
-		if (irq < 0)
-			goto regs_fail;
-		priv->interruptTransmit = irq;
-	}
-
-	/* get a pointer to the register memory */
-	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	priv->regs = ioremap(r->start, sizeof (struct gfar));
+	err = gfar_of_init(dev);
 
-	if (NULL == priv->regs) {
-		err = -ENOMEM;
+	if (err)
 		goto regs_fail;
-	}
 
 	spin_lock_init(&priv->txlock);
 	spin_lock_init(&priv->rxlock);
 	spin_lock_init(&priv->bflock);
 	INIT_WORK(&priv->reset_task, gfar_reset_task);
 
-	platform_set_drvdata(pdev, dev);
+	dev_set_drvdata(&ofdev->dev, priv);
 
 	/* Stop the DMA engine now, in case it was running before */
 	/* (The firmware could have used it, and left it running). */
@@ -239,13 +343,10 @@ static int gfar_probe(struct platform_device *pdev)
 	/* Initialize ECNTRL */
 	gfar_write(&priv->regs->ecntrl, ECNTRL_INIT_SETTINGS);
 
-	/* Copy the station address into the dev structure, */
-	memcpy(dev->dev_addr, einfo->mac_addr, MAC_ADDR_LEN);
-
 	/* Set the dev->base_addr to the gfar reg region */
 	dev->base_addr = (unsigned long) (priv->regs);
 
-	SET_NETDEV_DEV(dev, &pdev->dev);
+	SET_NETDEV_DEV(dev, &ofdev->dev);
 
 	/* Fill in the dev structure */
 	dev->open = gfar_enet_open;
@@ -263,7 +364,7 @@ static int gfar_probe(struct platform_device *pdev)
 
 	dev->ethtool_ops = &gfar_ethtool_ops;
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_CSUM) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_CSUM) {
 		priv->rx_csum_enable = 1;
 		dev->features |= NETIF_F_IP_CSUM;
 	} else
@@ -271,7 +372,7 @@ static int gfar_probe(struct platform_device *pdev)
 
 	priv->vlgrp = NULL;
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_VLAN) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_VLAN) {
 		dev->vlan_rx_register = gfar_vlan_rx_register;
 
 		dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
@@ -279,7 +380,7 @@ static int gfar_probe(struct platform_device *pdev)
 		priv->vlan_enable = 1;
 	}
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_EXTENDED_HASH) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_EXTENDED_HASH) {
 		priv->extended_hash = 1;
 		priv->hash_width = 9;
 
@@ -314,7 +415,7 @@ static int gfar_probe(struct platform_device *pdev)
 		priv->hash_regs[7] = &priv->regs->gaddr7;
 	}
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_PADDING)
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_PADDING)
 		priv->padding = DEFAULT_PADDING;
 	else
 		priv->padding = 0;
@@ -368,29 +469,28 @@ regs_fail:
 	return err;
 }
 
-static int gfar_remove(struct platform_device *pdev)
+static int gfar_remove(struct of_device *ofdev)
 {
-	struct net_device *dev = platform_get_drvdata(pdev);
-	struct gfar_private *priv = netdev_priv(dev);
+	struct gfar_private *priv = dev_get_drvdata(&ofdev->dev);
 
-	platform_set_drvdata(pdev, NULL);
+	dev_set_drvdata(&ofdev->dev, NULL);
 
 	iounmap(priv->regs);
-	free_netdev(dev);
+	free_netdev(priv->dev);
 
 	return 0;
 }
 
 #ifdef CONFIG_PM
-static int gfar_suspend(struct platform_device *pdev, pm_message_t state)
+static int gfar_suspend(struct of_device *ofdev, pm_message_t state)
 {
-	struct net_device *dev = platform_get_drvdata(pdev);
-	struct gfar_private *priv = netdev_priv(dev);
+	struct gfar_private *priv = dev_get_drvdata(&ofdev->dev);
+	struct net_device *dev = priv->dev;
 	unsigned long flags;
 	u32 tempval;
 
 	int magic_packet = priv->wol_en &&
-		(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET);
+		(priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET);
 
 	netif_device_detach(dev);
 
@@ -431,14 +531,14 @@ static int gfar_suspend(struct platform_device *pdev, pm_message_t state)
 	return 0;
 }
 
-static int gfar_resume(struct platform_device *pdev)
+static int gfar_resume(struct of_device *ofdev)
 {
-	struct net_device *dev = platform_get_drvdata(pdev);
-	struct gfar_private *priv = netdev_priv(dev);
+	struct gfar_private *priv = dev_get_drvdata(&ofdev->dev);
+	struct net_device *dev = priv->dev;
 	unsigned long flags;
 	u32 tempval;
 	int magic_packet = priv->wol_en &&
-		(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET);
+		(priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET);
 
 	if (!netif_running(dev)) {
 		netif_device_attach(dev);
@@ -497,7 +597,7 @@ static phy_interface_t gfar_get_interface(struct net_device *dev)
 		if (ecntrl & ECNTRL_REDUCED_MII_MODE)
 			return PHY_INTERFACE_MODE_RMII;
 		else {
-			phy_interface_t interface = priv->einfo->interface;
+			phy_interface_t interface = priv->interface;
 
 			/*
 			 * This isn't autodetected right now, so it must
@@ -510,7 +610,7 @@ static phy_interface_t gfar_get_interface(struct net_device *dev)
 		}
 	}
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT)
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT)
 		return PHY_INTERFACE_MODE_GMII;
 
 	return PHY_INTERFACE_MODE_MII;
@@ -524,21 +624,18 @@ static int init_phy(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 	uint gigabit_support =
-		priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT ?
+		priv->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT ?
 		SUPPORTED_1000baseT_Full : 0;
 	struct phy_device *phydev;
-	char phy_id[BUS_ID_SIZE];
 	phy_interface_t interface;
 
 	priv->oldlink = 0;
 	priv->oldspeed = 0;
 	priv->oldduplex = -1;
 
-	snprintf(phy_id, sizeof(phy_id), PHY_ID_FMT, priv->einfo->bus_id, priv->einfo->phy_id);
-
 	interface = gfar_get_interface(dev);
 
-	phydev = phy_connect(dev, phy_id, &adjust_link, 0, interface);
+	phydev = phy_connect(dev, priv->phy_bus_id, &adjust_link, 0, interface);
 
 	if (interface == PHY_INTERFACE_MODE_SGMII)
 		gfar_configure_serdes(dev);
@@ -569,35 +666,31 @@ static int init_phy(struct net_device *dev)
 static void gfar_configure_serdes(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
-	struct gfar_mii __iomem *regs =
-			(void __iomem *)&priv->regs->gfar_mii_regs;
-	int tbipa = gfar_read(&priv->regs->tbipa);
-	struct mii_bus *bus = gfar_get_miibus(priv);
 
-	if (bus)
-		mutex_lock(&bus->mdio_lock);
+	if (!priv->tbiphy) {
+		printk(KERN_WARNING "SGMII mode requires that the device "
+				"tree specify a tbi-handle\n");
+		return;
+	}
 
-	/* If the link is already up, we must already be ok, and don't need to
+	/*
+	 * If the link is already up, we must already be ok, and don't need to
 	 * configure and reset the TBI<->SerDes link.  Maybe U-Boot configured
 	 * everything for us?  Resetting it takes the link down and requires
 	 * several seconds for it to come back.
 	 */
-	if (gfar_local_mdio_read(regs, tbipa, MII_BMSR) & BMSR_LSTATUS)
-		goto done;
+	if (phy_read(priv->tbiphy, MII_BMSR) & BMSR_LSTATUS)
+		return;
 
 	/* Single clk mode, mii mode off(for serdes communication) */
-	gfar_local_mdio_write(regs, tbipa, MII_TBICON, TBICON_CLK_SELECT);
+	phy_write(priv->tbiphy, MII_TBICON, TBICON_CLK_SELECT);
 
-	gfar_local_mdio_write(regs, tbipa, MII_ADVERTISE,
+	phy_write(priv->tbiphy, MII_ADVERTISE,
 			ADVERTISE_1000XFULL | ADVERTISE_1000XPAUSE |
 			ADVERTISE_1000XPSE_ASYM);
 
-	gfar_local_mdio_write(regs, tbipa, MII_BMCR, BMCR_ANENABLE |
+	phy_write(priv->tbiphy, MII_BMCR, BMCR_ANENABLE |
 			BMCR_ANRESTART | BMCR_FULLDPLX | BMCR_SPEED1000);
-
-	done:
-	if (bus)
-		mutex_unlock(&bus->mdio_lock);
 }
 
 static void init_registers(struct net_device *dev)
@@ -630,7 +723,7 @@ static void init_registers(struct net_device *dev)
 	gfar_write(&priv->regs->gaddr7, 0);
 
 	/* Zero out the rmon mib registers if it has them */
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_RMON) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_RMON) {
 		memset_io(&(priv->regs->rmon), 0, sizeof (struct rmon_mib));
 
 		/* Mask off the CAM interrupts */
@@ -705,7 +798,7 @@ void stop_gfar(struct net_device *dev)
 	spin_unlock_irqrestore(&priv->txlock, flags);
 
 	/* Free the IRQs */
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
 		free_irq(priv->interruptError, dev);
 		free_irq(priv->interruptTransmit, dev);
 		free_irq(priv->interruptReceive, dev);
@@ -919,7 +1012,7 @@ int startup_gfar(struct net_device *dev)
 
 	/* If the device has multiple interrupts, register for
 	 * them.  Otherwise, only register for the one */
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
 		/* Install our interrupt handlers for Error,
 		 * Transmit, and Receive */
 		if (request_irq(priv->interruptError, gfar_error,
@@ -1751,7 +1844,7 @@ static void gfar_netpoll(struct net_device *dev)
 	struct gfar_private *priv = netdev_priv(dev);
 
 	/* If the device has multiple interrupts, run tx/rx */
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
 		disable_irq(priv->interruptTransmit);
 		disable_irq(priv->interruptReceive);
 		disable_irq(priv->interruptError);
@@ -2045,7 +2138,7 @@ static irqreturn_t gfar_error(int irq, void *dev_id)
 	gfar_write(&priv->regs->ievent, events & IEVENT_ERR_MASK);
 
 	/* Magic Packet is not an error. */
-	if ((priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) &&
+	if ((priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) &&
 	    (events & IEVENT_MAG))
 		events &= ~IEVENT_MAG;
 
@@ -2111,16 +2204,24 @@ static irqreturn_t gfar_error(int irq, void *dev_id)
 /* work with hotplug and coldplug */
 MODULE_ALIAS("platform:fsl-gianfar");
 
+static struct of_device_id gfar_match[] =
+{
+	{
+		.type = "network",
+		.compatible = "gianfar",
+	},
+	{},
+};
+
 /* Structure for a device driver */
-static struct platform_driver gfar_driver = {
+static struct of_platform_driver gfar_driver = {
+	.name = "fsl-gianfar",
+	.match_table = gfar_match,
+
 	.probe = gfar_probe,
 	.remove = gfar_remove,
 	.suspend = gfar_suspend,
 	.resume = gfar_resume,
-	.driver	= {
-		.name = "fsl-gianfar",
-		.owner = THIS_MODULE,
-	},
 };
 
 static int __init gfar_init(void)
@@ -2130,7 +2231,7 @@ static int __init gfar_init(void)
 	if (err)
 		return err;
 
-	err = platform_driver_register(&gfar_driver);
+	err = of_register_platform_driver(&gfar_driver);
 
 	if (err)
 		gfar_mdio_exit();
@@ -2140,7 +2241,7 @@ static int __init gfar_init(void)
 
 static void __exit gfar_exit(void)
 {
-	platform_driver_unregister(&gfar_driver);
+	of_unregister_platform_driver(&gfar_driver);
 	gfar_mdio_exit();
 }
 
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index f46e9b63af1..ca7f0a6a68c 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -657,6 +657,19 @@ struct gfar {
 
 };
 
+/* Flags related to gianfar device features */
+#define FSL_GIANFAR_DEV_HAS_GIGABIT		0x00000001
+#define FSL_GIANFAR_DEV_HAS_COALESCE		0x00000002
+#define FSL_GIANFAR_DEV_HAS_RMON		0x00000004
+#define FSL_GIANFAR_DEV_HAS_MULTI_INTR		0x00000008
+#define FSL_GIANFAR_DEV_HAS_CSUM		0x00000010
+#define FSL_GIANFAR_DEV_HAS_VLAN		0x00000020
+#define FSL_GIANFAR_DEV_HAS_EXTENDED_HASH	0x00000040
+#define FSL_GIANFAR_DEV_HAS_PADDING		0x00000080
+#define FSL_GIANFAR_DEV_HAS_MAGIC_PACKET	0x00000100
+#define FSL_GIANFAR_DEV_HAS_BD_STASHING		0x00000200
+#define FSL_GIANFAR_DEV_HAS_BUF_STASHING	0x00000400
+
 /* Struct stolen almost completely (and shamelessly) from the FCC enet source
  * (Ok, that's not so true anymore, but there is a family resemblence)
  * The GFAR buffer descriptors track the ring buffers.  The rx_bd_base
@@ -694,6 +707,7 @@ struct gfar_private {
 	/* RX Locked fields */
 	spinlock_t rxlock;
 
+	struct device_node *node;
 	struct net_device *dev;
 	struct napi_struct napi;
 
@@ -733,6 +747,9 @@ struct gfar_private {
 	/* Bitfield update lock */
 	spinlock_t bflock;
 
+	phy_interface_t interface;
+	char	phy_bus_id[BUS_ID_SIZE];
+	u32 device_flags;
 	unsigned char vlan_enable:1,
 		rx_csum_enable:1,
 		extended_hash:1,
@@ -744,11 +761,9 @@ struct gfar_private {
 	unsigned int interruptReceive;
 	unsigned int interruptError;
 
-	/* info structure initialized by platform code */
-	struct gianfar_platform_data *einfo;
-
 	/* PHY stuff */
 	struct phy_device *phydev;
+	struct phy_device *tbiphy;
 	struct mii_bus *mii_bus;
 	int oldspeed;
 	int oldduplex;
diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c
index fb7d3ccc0fd..53944b120a3 100644
--- a/drivers/net/gianfar_ethtool.c
+++ b/drivers/net/gianfar_ethtool.c
@@ -121,7 +121,7 @@ static void gfar_gstrings(struct net_device *dev, u32 stringset, u8 * buf)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_RMON)
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_RMON)
 		memcpy(buf, stat_gstrings, GFAR_STATS_LEN * ETH_GSTRING_LEN);
 	else
 		memcpy(buf, stat_gstrings,
@@ -138,7 +138,7 @@ static void gfar_fill_stats(struct net_device *dev, struct ethtool_stats *dummy,
 	struct gfar_private *priv = netdev_priv(dev);
 	u64 *extra = (u64 *) & priv->extra_stats;
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_RMON) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_RMON) {
 		u32 __iomem *rmon = (u32 __iomem *) & priv->regs->rmon;
 		struct gfar_stats *stats = (struct gfar_stats *) buf;
 
@@ -158,7 +158,7 @@ static int gfar_sset_count(struct net_device *dev, int sset)
 
 	switch (sset) {
 	case ETH_SS_STATS:
-		if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_RMON)
+		if (priv->device_flags & FSL_GIANFAR_DEV_HAS_RMON)
 			return GFAR_STATS_LEN;
 		else
 			return GFAR_EXTRA_STATS_LEN;
@@ -280,7 +280,7 @@ static int gfar_gcoalesce(struct net_device *dev, struct ethtool_coalesce *cvals
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_COALESCE))
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_COALESCE))
 		return -EOPNOTSUPP;
 
 	if (NULL == priv->phydev)
@@ -332,7 +332,7 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_COALESCE))
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_COALESCE))
 		return -EOPNOTSUPP;
 
 	/* Set up rx coalescing */
@@ -482,7 +482,7 @@ static int gfar_set_rx_csum(struct net_device *dev, uint32_t data)
 	unsigned long flags;
 	int err = 0;
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
 		return -EOPNOTSUPP;
 
 	if (dev->flags & IFF_UP) {
@@ -515,7 +515,7 @@ static uint32_t gfar_get_rx_csum(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
 		return 0;
 
 	return priv->rx_csum_enable;
@@ -526,7 +526,7 @@ static int gfar_set_tx_csum(struct net_device *dev, uint32_t data)
 	unsigned long flags;
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
 		return -EOPNOTSUPP;
 
 	spin_lock_irqsave(&priv->txlock, flags);
@@ -547,7 +547,7 @@ static uint32_t gfar_get_tx_csum(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_CSUM))
 		return 0;
 
 	return (dev->features & NETIF_F_IP_CSUM) != 0;
@@ -570,7 +570,7 @@ static void gfar_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 {
 	struct gfar_private *priv = netdev_priv(dev);
 
-	if (priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) {
+	if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) {
 		wol->supported = WAKE_MAGIC;
 		wol->wolopts = priv->wol_en ? WAKE_MAGIC : 0;
 	} else {
@@ -583,7 +583,7 @@ static int gfar_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 	struct gfar_private *priv = netdev_priv(dev);
 	unsigned long flags;
 
-	if (!(priv->einfo->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) &&
+	if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) &&
 	    wol->wolopts != 0)
 		return -EINVAL;
 
diff --git a/drivers/net/gianfar_mii.c b/drivers/net/gianfar_mii.c
index 0e2595d2493..f3706e415b4 100644
--- a/drivers/net/gianfar_mii.c
+++ b/drivers/net/gianfar_mii.c
@@ -34,6 +34,8 @@
 #include <linux/crc32.h>
 #include <linux/mii.h>
 #include <linux/phy.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -150,19 +152,83 @@ static int gfar_mdio_reset(struct mii_bus *bus)
 	return 0;
 }
 
+/* Allocate an array which provides irq #s for each PHY on the given bus */
+static int *create_irq_map(struct device_node *np)
+{
+	int *irqs;
+	int i;
+	struct device_node *child = NULL;
+
+	irqs = kcalloc(PHY_MAX_ADDR, sizeof(int), GFP_KERNEL);
+
+	if (!irqs)
+		return NULL;
+
+	for (i = 0; i < PHY_MAX_ADDR; i++)
+		irqs[i] = PHY_POLL;
+
+	while ((child = of_get_next_child(np, child)) != NULL) {
+		int irq = irq_of_parse_and_map(child, 0);
+		const u32 *id;
+
+		if (irq == NO_IRQ)
+			continue;
+
+		id = of_get_property(child, "reg", NULL);
+
+		if (!id)
+			continue;
+
+		if (*id < PHY_MAX_ADDR && *id >= 0)
+			irqs[*id] = irq;
+		else
+			printk(KERN_WARNING "%s: "
+					"%d is not a valid PHY address\n",
+					np->full_name, *id);
+	}
+
+	return irqs;
+}
+
+
+void gfar_mdio_bus_name(char *name, struct device_node *np)
+{
+	const u32 *reg;
+
+	reg = of_get_property(np, "reg", NULL);
 
-static int gfar_mdio_probe(struct device *dev)
+	snprintf(name, MII_BUS_ID_SIZE, "%s@%x", np->name, reg ? *reg : 0);
+}
+
+/* Scan the bus in reverse, looking for an empty spot */
+static int gfar_mdio_find_free(struct mii_bus *new_bus)
+{
+	int i;
+
+	for (i = PHY_MAX_ADDR; i > 0; i--) {
+		u32 phy_id;
+
+		if (get_phy_id(new_bus, i, &phy_id))
+			return -1;
+
+		if (phy_id == 0xffffffff)
+			break;
+	}
+
+	return i;
+}
+
+static int gfar_mdio_probe(struct of_device *ofdev,
+		const struct of_device_id *match)
 {
-	struct platform_device *pdev = to_platform_device(dev);
-	struct gianfar_mdio_data *pdata;
 	struct gfar_mii __iomem *regs;
 	struct gfar __iomem *enet_regs;
 	struct mii_bus *new_bus;
-	struct resource *r;
-	int i, err = 0;
-
-	if (NULL == dev)
-		return -EINVAL;
+	int err = 0;
+	u64 addr, size;
+	struct device_node *np = ofdev->node;
+	struct device_node *tbi;
+	int tbiaddr = -1;
 
 	new_bus = mdiobus_alloc();
 	if (NULL == new_bus)
@@ -172,31 +238,28 @@ static int gfar_mdio_probe(struct device *dev)
 	new_bus->read = &gfar_mdio_read,
 	new_bus->write = &gfar_mdio_write,
 	new_bus->reset = &gfar_mdio_reset,
-	snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", pdev->id);
-
-	pdata = (struct gianfar_mdio_data *)pdev->dev.platform_data;
-
-	if (NULL == pdata) {
-		printk(KERN_ERR "gfar mdio %d: Missing platform data!\n", pdev->id);
-		return -ENODEV;
-	}
-
-	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	gfar_mdio_bus_name(new_bus->id, np);
 
 	/* Set the PHY base address */
-	regs = ioremap(r->start, sizeof (struct gfar_mii));
+	addr = of_translate_address(np, of_get_address(np, 0, &size, NULL));
+	regs = ioremap(addr, size);
 
 	if (NULL == regs) {
 		err = -ENOMEM;
-		goto reg_map_fail;
+		goto err_free_bus;
 	}
 
 	new_bus->priv = (void __force *)regs;
 
-	new_bus->irq = pdata->irq;
+	new_bus->irq = create_irq_map(np);
+
+	if (new_bus->irq == NULL) {
+		err = -ENOMEM;
+		goto err_unmap_regs;
+	}
 
-	new_bus->parent = dev;
-	dev_set_drvdata(dev, new_bus);
+	new_bus->parent = &ofdev->dev;
+	dev_set_drvdata(&ofdev->dev, new_bus);
 
 	/*
 	 * This is mildly evil, but so is our hardware for doing this.
@@ -206,96 +269,109 @@ static int gfar_mdio_probe(struct device *dev)
 	enet_regs = (struct gfar __iomem *)
 		((char *)regs - offsetof(struct gfar, gfar_mii_regs));
 
-	/* Scan the bus, looking for an empty spot for TBIPA */
-	gfar_write(&enet_regs->tbipa, 0);
-	for (i = PHY_MAX_ADDR; i > 0; i--) {
-		u32 phy_id;
+	for_each_child_of_node(np, tbi) {
+		if (!strncmp(tbi->type, "tbi-phy", 8))
+			break;
+	}
 
-		err = get_phy_id(new_bus, i, &phy_id);
-		if (err)
-			goto bus_register_fail;
+	if (tbi) {
+		const u32 *prop = of_get_property(tbi, "reg", NULL);
 
-		if (phy_id == 0xffffffff)
-			break;
+		if (prop)
+			tbiaddr = *prop;
 	}
 
-	/* The bus is full.  We don't support using 31 PHYs, sorry */
-	if (i == 0) {
+	if (tbiaddr == -1) {
+		gfar_write(&enet_regs->tbipa, 0);
+
+		tbiaddr = gfar_mdio_find_free(new_bus);
+	}
+
+	/*
+	 * We define TBIPA at 0 to be illegal, opting to fail for boards that
+	 * have PHYs at 1-31, rather than change tbipa and rescan.
+	 */
+	if (tbiaddr == 0) {
 		err = -EBUSY;
 
-		goto bus_register_fail;
+		goto err_free_irqs;
 	}
 
-	gfar_write(&enet_regs->tbipa, i);
+	gfar_write(&enet_regs->tbipa, tbiaddr);
+
+	/*
+	 * The TBIPHY-only buses will find PHYs at every address,
+	 * so we mask them all but the TBI
+	 */
+	if (!of_device_is_compatible(np, "fsl,gianfar-mdio"))
+		new_bus->phy_mask = ~(1 << tbiaddr);
 
 	err = mdiobus_register(new_bus);
 
-	if (0 != err) {
+	if (err != 0) {
 		printk (KERN_ERR "%s: Cannot register as MDIO bus\n",
 				new_bus->name);
-		goto bus_register_fail;
+		goto err_free_irqs;
 	}
 
 	return 0;
 
-bus_register_fail:
+err_free_irqs:
+	kfree(new_bus->irq);
+err_unmap_regs:
 	iounmap(regs);
-reg_map_fail:
+err_free_bus:
 	mdiobus_free(new_bus);
 
 	return err;
 }
 
 
-static int gfar_mdio_remove(struct device *dev)
+static int gfar_mdio_remove(struct of_device *ofdev)
 {
-	struct mii_bus *bus = dev_get_drvdata(dev);
+	struct mii_bus *bus = dev_get_drvdata(&ofdev->dev);
 
 	mdiobus_unregister(bus);
 
-	dev_set_drvdata(dev, NULL);
+	dev_set_drvdata(&ofdev->dev, NULL);
 
 	iounmap((void __iomem *)bus->priv);
 	bus->priv = NULL;
+	kfree(bus->irq);
 	mdiobus_free(bus);
 
 	return 0;
 }
 
-static struct device_driver gianfar_mdio_driver = {
+static struct of_device_id gfar_mdio_match[] =
+{
+	{
+		.compatible = "fsl,gianfar-mdio",
+	},
+	{
+		.compatible = "fsl,gianfar-tbi",
+	},
+	{
+		.type = "mdio",
+		.compatible = "gianfar",
+	},
+	{},
+};
+
+static struct of_platform_driver gianfar_mdio_driver = {
 	.name = "fsl-gianfar_mdio",
-	.bus = &platform_bus_type,
+	.match_table = gfar_mdio_match,
+
 	.probe = gfar_mdio_probe,
 	.remove = gfar_mdio_remove,
 };
 
-static int match_mdio_bus(struct device *dev, void *data)
-{
-	const struct gfar_private *priv = data;
-	const struct platform_device *pdev = to_platform_device(dev);
-
-	return !strcmp(pdev->name, gianfar_mdio_driver.name) &&
-		pdev->id == priv->einfo->mdio_bus;
-}
-
-/* Given a gfar_priv structure, find the mii_bus controlled by this device (not
- * necessarily the same as the bus the gfar's PHY is on), if one exists.
- * Normally only the first gianfar controls a mii_bus.  */
-struct mii_bus *gfar_get_miibus(const struct gfar_private *priv)
-{
-	/*const*/ struct device *d;
-
-	d = bus_find_device(gianfar_mdio_driver.bus, NULL, (void *)priv,
-			    match_mdio_bus);
-	return d ? dev_get_drvdata(d) : NULL;
-}
-
 int __init gfar_mdio_init(void)
 {
-	return driver_register(&gianfar_mdio_driver);
+	return of_register_platform_driver(&gianfar_mdio_driver);
 }
 
 void gfar_mdio_exit(void)
 {
-	driver_unregister(&gianfar_mdio_driver);
+	of_unregister_platform_driver(&gianfar_mdio_driver);
 }
diff --git a/drivers/net/gianfar_mii.h b/drivers/net/gianfar_mii.h
index 02dc970ca1f..65c242cd468 100644
--- a/drivers/net/gianfar_mii.h
+++ b/drivers/net/gianfar_mii.h
@@ -49,4 +49,6 @@ int gfar_local_mdio_read(struct gfar_mii __iomem *regs, int mii_id, int regnum);
 struct mii_bus *gfar_get_miibus(const struct gfar_private *priv);
 int __init gfar_mdio_init(void);
 void gfar_mdio_exit(void);
+
+void gfar_mdio_bus_name(char *name, struct device_node *np);
 #endif /* GIANFAR_PHY_H */
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 708bab58d8d..d9051d717d2 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -47,12 +47,7 @@
 struct gianfar_platform_data {
 	/* device specific information */
 	u32	device_flags;
-	/* board specific information */
-	u32	board_flags;
-	int	mdio_bus;			/* Bus controlled by us */
-	char	bus_id[MII_BUS_ID_SIZE];	/* Bus PHY is on */
-	u32	phy_id;
-	u8	mac_addr[6];
+	char	bus_id[BUS_ID_SIZE];
 	phy_interface_t interface;
 };
 
@@ -61,17 +56,6 @@ struct gianfar_mdio_data {
 	int	irq[32];
 };
 
-/* Flags related to gianfar device features */
-#define FSL_GIANFAR_DEV_HAS_GIGABIT		0x00000001
-#define FSL_GIANFAR_DEV_HAS_COALESCE		0x00000002
-#define FSL_GIANFAR_DEV_HAS_RMON		0x00000004
-#define FSL_GIANFAR_DEV_HAS_MULTI_INTR		0x00000008
-#define FSL_GIANFAR_DEV_HAS_CSUM		0x00000010
-#define FSL_GIANFAR_DEV_HAS_VLAN		0x00000020
-#define FSL_GIANFAR_DEV_HAS_EXTENDED_HASH	0x00000040
-#define FSL_GIANFAR_DEV_HAS_PADDING		0x00000080
-#define FSL_GIANFAR_DEV_HAS_MAGIC_PACKET	0x00000100
-
 /* Flags in gianfar_platform_data */
 #define FSL_GIANFAR_BRD_HAS_PHY_INTR	0x00000001 /* set or use a timer */
 #define FSL_GIANFAR_BRD_IS_REDUCED	0x00000002 /* Set if RGMII, RMII */
-- 
cgit v1.2.3-70-g09d2


From e08e1f7adba522378e8d2ae941bf25443866136d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:30 -0800
Subject: swiotlb: allow architectures to override phys<->bus<->phys
 conversions

Impact: generalize phys<->bus<->phys conversions in the swiotlb code

Architectures may need to override these conversions. Implement a
__weak hook point containing the default implementation.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  3 +++
 lib/swiotlb.c           | 52 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 39 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 58b996a642f..694f1839cbc 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -27,6 +27,9 @@ swiotlb_init(void);
 extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs);
 extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 
+extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address);
+extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1272b23e476..3494263cdd9 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -125,6 +125,26 @@ void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs)
 	return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
 }
 
+dma_addr_t __weak swiotlb_phys_to_bus(phys_addr_t paddr)
+{
+	return paddr;
+}
+
+phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
+{
+	return baddr;
+}
+
+static dma_addr_t swiotlb_virt_to_bus(volatile void *address)
+{
+	return swiotlb_phys_to_bus(virt_to_phys(address));
+}
+
+static void *swiotlb_bus_to_virt(dma_addr_t address)
+{
+	return phys_to_virt(swiotlb_bus_to_phys(address));
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -168,7 +188,7 @@ swiotlb_init_with_default_size(size_t default_size)
 		panic("Cannot allocate SWIOTLB overflow buffer!\n");
 
 	printk(KERN_INFO "Placing software IO TLB between 0x%lx - 0x%lx\n",
-	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+	       swiotlb_virt_to_bus(io_tlb_start), swiotlb_virt_to_bus(io_tlb_end));
 }
 
 void __init
@@ -250,7 +270,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 
 	printk(KERN_INFO "Placing %luMB software IO TLB between 0x%lx - "
 	       "0x%lx\n", bytes >> 20,
-	       virt_to_bus(io_tlb_start), virt_to_bus(io_tlb_end));
+	       swiotlb_virt_to_bus(io_tlb_start), swiotlb_virt_to_bus(io_tlb_end));
 
 	return 0;
 
@@ -298,7 +318,7 @@ map_single(struct device *hwdev, char *buffer, size_t size, int dir)
 	unsigned long max_slots;
 
 	mask = dma_get_seg_boundary(hwdev);
-	start_dma_addr = virt_to_bus(io_tlb_start) & mask;
+	start_dma_addr = swiotlb_virt_to_bus(io_tlb_start) & mask;
 
 	offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
 
@@ -475,7 +495,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		dma_mask = hwdev->coherent_dma_mask;
 
 	ret = (void *)__get_free_pages(flags, order);
-	if (ret && !is_buffer_dma_capable(dma_mask, virt_to_bus(ret), size)) {
+	if (ret && !is_buffer_dma_capable(dma_mask, swiotlb_virt_to_bus(ret), size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
 		 * Fall back on swiotlb_map_single().
@@ -496,7 +516,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	}
 
 	memset(ret, 0, size);
-	dev_addr = virt_to_bus(ret);
+	dev_addr = swiotlb_virt_to_bus(ret);
 
 	/* Confirm address can be DMA'd by device */
 	if (!is_buffer_dma_capable(dma_mask, dev_addr, size)) {
@@ -556,7 +576,7 @@ dma_addr_t
 swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 			 int dir, struct dma_attrs *attrs)
 {
-	dma_addr_t dev_addr = virt_to_bus(ptr);
+	dma_addr_t dev_addr = swiotlb_virt_to_bus(ptr);
 	void *map;
 
 	BUG_ON(dir == DMA_NONE);
@@ -577,7 +597,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 		map = io_tlb_overflow_buffer;
 	}
 
-	dev_addr = virt_to_bus(map);
+	dev_addr = swiotlb_virt_to_bus(map);
 
 	/*
 	 * Ensure that the address returned is DMA'ble
@@ -607,7 +627,7 @@ void
 swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr,
 			   size_t size, int dir, struct dma_attrs *attrs)
 {
-	char *dma_addr = bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 	if (is_swiotlb_buffer(dma_addr))
@@ -637,7 +657,7 @@ static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 		    size_t size, int dir, int target)
 {
-	char *dma_addr = bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 	if (is_swiotlb_buffer(dma_addr))
@@ -668,7 +688,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
 			  unsigned long offset, size_t size,
 			  int dir, int target)
 {
-	char *dma_addr = bus_to_virt(dev_addr) + offset;
+	char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
 
 	BUG_ON(dir == DMA_NONE);
 	if (is_swiotlb_buffer(dma_addr))
@@ -724,7 +744,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
-		dev_addr = virt_to_bus(addr);
+		dev_addr = swiotlb_virt_to_bus(addr);
 		if (swiotlb_force ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
 			void *map = map_single(hwdev, addr, sg->length, dir);
@@ -737,7 +757,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 				sgl[0].dma_length = 0;
 				return 0;
 			}
-			sg->dma_address = virt_to_bus(map);
+			sg->dma_address = swiotlb_virt_to_bus(map);
 		} else
 			sg->dma_address = dev_addr;
 		sg->dma_length = sg->length;
@@ -768,7 +788,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
-			unmap_single(hwdev, bus_to_virt(sg->dma_address),
+			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				     sg->dma_length, dir);
 		else if (dir == DMA_FROM_DEVICE)
 			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
@@ -801,7 +821,7 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 
 	for_each_sg(sgl, sg, nelems, i) {
 		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
-			sync_single(hwdev, bus_to_virt(sg->dma_address),
+			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir, target);
 		else if (dir == DMA_FROM_DEVICE)
 			dma_mark_clean(SG_ENT_VIRT_ADDRESS(sg), sg->dma_length);
@@ -825,7 +845,7 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-	return (dma_addr == virt_to_bus(io_tlb_overflow_buffer));
+	return (dma_addr == swiotlb_virt_to_bus(io_tlb_overflow_buffer));
 }
 
 /*
@@ -837,7 +857,7 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return virt_to_bus(io_tlb_end - 1) <= mask;
+	return swiotlb_virt_to_bus(io_tlb_end - 1) <= mask;
 }
 
 EXPORT_SYMBOL(swiotlb_map_single);
-- 
cgit v1.2.3-70-g09d2


From b81ea27b2329bf44b30c427800954f845896d476 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Tue, 16 Dec 2008 12:17:31 -0800
Subject: swiotlb: add arch hook to force mapping

Impact: generalize the sw-IOTLB range checks

Some architectures require special rules to determine whether a range
needs mapping or not.  This adds a weak function for architectures to
override.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/swiotlb.h |  2 ++
 lib/swiotlb.c           | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 694f1839cbc..325af1de035 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -30,6 +30,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address);
 extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
 
+extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size);
+
 extern void
 *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			dma_addr_t *dma_handle, gfp_t flags);
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 3494263cdd9..d8b09051c45 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -145,6 +145,11 @@ static void *swiotlb_bus_to_virt(dma_addr_t address)
 	return phys_to_virt(swiotlb_bus_to_phys(address));
 }
 
+int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
+{
+	return 0;
+}
+
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
  * structures for the software IO TLB used to implement the DMA API.
@@ -297,6 +302,11 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
 }
 
+static inline int range_needs_mapping(void *ptr, size_t size)
+{
+	return swiotlb_force || swiotlb_arch_range_needs_mapping(ptr, size);
+}
+
 static int is_swiotlb_buffer(char *addr)
 {
 	return addr >= io_tlb_start && addr < io_tlb_end;
@@ -585,7 +595,8 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
-	if (!address_needs_mapping(hwdev, dev_addr, size) && !swiotlb_force)
+	if (!address_needs_mapping(hwdev, dev_addr, size) &&
+	    !range_needs_mapping(ptr, size))
 		return dev_addr;
 
 	/*
@@ -745,7 +756,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 	for_each_sg(sgl, sg, nelems, i) {
 		addr = SG_ENT_VIRT_ADDRESS(sg);
 		dev_addr = swiotlb_virt_to_bus(addr);
-		if (swiotlb_force ||
+		if (range_needs_mapping(sg_virt(sg), sg->length) ||
 		    address_needs_mapping(hwdev, dev_addr, sg->length)) {
 			void *map = map_single(hwdev, addr, sg->length, dir);
 			if (!map) {
-- 
cgit v1.2.3-70-g09d2


From 9a9fafb89433c5fd1331bac0c84c4b321e358b42 Mon Sep 17 00:00:00 2001
From: Phil Endecott <usb_endian_patch@chezphil.org>
Date: Mon, 1 Dec 2008 10:22:33 -0500
Subject: USB: fix comment about endianness of descriptors

This patch fixes a comment and clarifies the documentation about the
endianness of descriptors. The current policy is that descriptors will
be little-endian at the API even on big-endian systems; however the
/proc/bus/usb API predates this policy and presents descriptors with
some multibyte fields byte-swapped.

Signed-off-by: Phil Endecott <usb_endian_patch@chezphil.org>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/usb/proc_usb_info.txt | 6 ++++--
 include/linux/usb/ch9.h             | 8 ++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/usb/proc_usb_info.txt b/Documentation/usb/proc_usb_info.txt
index 077e9032d0c..fafcd472326 100644
--- a/Documentation/usb/proc_usb_info.txt
+++ b/Documentation/usb/proc_usb_info.txt
@@ -49,8 +49,10 @@ it and 002/048 sometime later.
 
 These files can be read as binary data.  The binary data consists
 of first the device descriptor, then the descriptors for each
-configuration of the device.  That information is also shown in
-text form by the /proc/bus/usb/devices file, described later.
+configuration of the device.  Multi-byte fields in the device and
+configuration descriptors, but not other descriptors, are converted
+to host endianness by the kernel.  This information is also shown
+in text form by the /proc/bus/usb/devices file, described later.
 
 These files may also be used to write user-level drivers for the USB
 devices.  You would open the /proc/bus/usb/BBB/DDD file read/write,
diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h
index 73a2f4eb1f7..9b42baed390 100644
--- a/include/linux/usb/ch9.h
+++ b/include/linux/usb/ch9.h
@@ -158,8 +158,12 @@ struct usb_ctrlrequest {
  * (rarely) accepted by SET_DESCRIPTOR.
  *
  * Note that all multi-byte values here are encoded in little endian
- * byte order "on the wire".  But when exposed through Linux-USB APIs,
- * they've been converted to cpu byte order.
+ * byte order "on the wire".  Within the kernel and when exposed
+ * through the Linux-USB APIs, they are not converted to cpu byte
+ * order; it is the responsibility of the client code to do this.
+ * The single exception is when device and configuration descriptors (but
+ * not other descriptors) are read from usbfs (i.e. /proc/bus/usb/BBB/DDD);
+ * in this case the fields are converted to host endianness by the kernel.
  */
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 2d91d78b68606ff7ce52ea70e187dee7831aa2f6 Mon Sep 17 00:00:00 2001
From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Wed, 17 Dec 2008 15:47:29 -0800
Subject: Phonet: allocate a non-Ethernet ARP type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also leave some room for more 802.11 types.

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_arp.h | 2 ++
 net/core/dev.c         | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index 4d3401812e6..11df77ab2db 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -87,6 +87,8 @@
 #define ARPHRD_IEEE80211_PRISM 802	/* IEEE 802.11 + Prism2 header  */
 #define ARPHRD_IEEE80211_RADIOTAP 803	/* IEEE 802.11 + radiotap header */
 
+#define ARPHRD_PHONET	820		/* PhoNet media type		*/
+
 #define ARPHRD_VOID	  0xFFFF	/* Void type, nothing is known */
 #define ARPHRD_NONE	  0xFFFE	/* zero header length */
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d8d7d1fccde..15aab0c46d6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -283,8 +283,8 @@ static const unsigned short netdev_lock_type[] =
 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
-	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID,
-	 ARPHRD_NONE};
+	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
+	 ARPHRD_VOID, ARPHRD_NONE};
 
 static const char *netdev_lock_name[] =
 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
@@ -300,8 +300,8 @@ static const char *netdev_lock_name[] =
 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
-	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID",
-	 "_xmit_NONE"};
+	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
+	 "_xmit_VOID", "_xmit_NONE"};
 
 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
-- 
cgit v1.2.3-70-g09d2


From 57c81fffc863fb4c1804bc963bcbfb82d736c6df Mon Sep 17 00:00:00 2001
From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Date: Wed, 17 Dec 2008 15:47:48 -0800
Subject: Phonet: allocate separate ARP type for GPRS over a Phonet pipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A separate xmit lock class supports GPRS over a Phonet pipe over a TUN
device (type ARPHRD_NONE).

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_arp.h | 1 +
 net/core/dev.c         | 4 ++--
 net/phonet/pep-gprs.c  | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index 11df77ab2db..5ff89809a58 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -88,6 +88,7 @@
 #define ARPHRD_IEEE80211_RADIOTAP 803	/* IEEE 802.11 + radiotap header */
 
 #define ARPHRD_PHONET	820		/* PhoNet media type		*/
+#define ARPHRD_PHONET_PIPE 821		/* PhoNet pipe header		*/
 
 #define ARPHRD_VOID	  0xFFFF	/* Void type, nothing is known */
 #define ARPHRD_NONE	  0xFFFE	/* zero header length */
diff --git a/net/core/dev.c b/net/core/dev.c
index 15aab0c46d6..048cf119787 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -284,7 +284,7 @@ static const unsigned short netdev_lock_type[] =
 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
-	 ARPHRD_VOID, ARPHRD_NONE};
+	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
 
 static const char *netdev_lock_name[] =
 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
@@ -301,7 +301,7 @@ static const char *netdev_lock_name[] =
 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
-	 "_xmit_VOID", "_xmit_NONE"};
+	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
 
 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index 0b640b0fce0..a2873203dff 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -260,7 +260,7 @@ static int gprs_set_mtu(struct net_device *dev, int new_mtu)
 static void gprs_setup(struct net_device *dev)
 {
 	dev->features		= NETIF_F_FRAGLIST;
-	dev->type		= ARPHRD_NONE;
+	dev->type		= ARPHRD_PHONET_PIPE;
 	dev->flags		= IFF_POINTOPOINT | IFF_NOARP;
 	dev->mtu		= GPRS_DEFAULT_MTU;
 	dev->hard_header_len	= 0;
-- 
cgit v1.2.3-70-g09d2


From f38f1d2aa5a3520cf05da7cd6bd12fe2b0c509b7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 16 Dec 2008 23:06:40 -0500
Subject: trace: add a way to enable or disable the stack tracer

Impact: enhancement to stack tracer

The stack tracer currently is either on when configured in or
off when it is not. It can not be disabled when it is configured on.
(besides disabling the function tracer that it uses)

This patch adds a way to enable or disable the stack tracer at
run time. It defaults off on bootup, but a kernel parameter 'stacktrace'
has been added to enable it on bootup.

A new sysctl has been added "kernel.stack_tracer_enabled" to let
the user enable or disable the stack tracer at run time.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  4 +++
 include/linux/ftrace.h              |  8 ++++++
 kernel/sysctl.c                     | 10 +++++++
 kernel/trace/Kconfig                | 13 +++++++---
 kernel/trace/trace_stack.c          | 52 ++++++++++++++++++++++++++++++++++---
 5 files changed, 79 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2919a2e9193..edab81c1318 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -89,6 +89,7 @@ parameter is applicable:
 	SPARC	Sparc architecture is enabled.
 	SWSUSP	Software suspend (hibernation) is enabled.
 	SUSPEND	System suspend states are enabled.
+	FTRACE	Function tracing enabled.
 	TS	Appropriate touchscreen support is enabled.
 	USB	USB support is enabled.
 	USBHID	USB Human Interface Device support is enabled.
@@ -2173,6 +2174,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	st=		[HW,SCSI] SCSI tape parameters (buffers, etc.)
 			See Documentation/scsi/st.txt.
 
+	stacktrace	[FTRACE]
+			Enabled the stack tracer on boot up.
+
 	sti=		[PARISC,HW]
 			Format: <num>
 			Set the STI (builtin display/keyboard on the HP-PARISC
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 44020f31bd8..6b0db53caa7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -86,6 +86,14 @@ static inline void ftrace_stop(void) { }
 static inline void ftrace_start(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_STACK_TRACER
+extern int stack_tracer_enabled;
+int
+stack_trace_sysctl(struct ctl_table *table, int write,
+		   struct file *file, void __user *buffer, size_t *lenp,
+		   loff_t *ppos);
+#endif
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
 #include <asm/ftrace.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c83f566e940..6ac501a2dcc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &ftrace_enable_sysctl,
 	},
 #endif
+#ifdef CONFIG_STACK_TRACER
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "stack_tracer_enabled",
+		.data		= &stack_tracer_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &stack_trace_sysctl,
+	},
+#endif
 #ifdef CONFIG_TRACING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d8bae6f4219..e2a4ff6fc3a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -244,10 +244,15 @@ config STACK_TRACER
 
 	  This tracer works by hooking into every function call that the
 	  kernel executes, and keeping a maximum stack depth value and
-	  stack-trace saved. Because this logic has to execute in every
-	  kernel function, all the time, this option can slow down the
-	  kernel measurably and is generally intended for kernel
-	  developers only.
+	  stack-trace saved.  If this is configured with DYNAMIC_FTRACE
+	  then it will not have any overhead while the stack tracer
+	  is disabled.
+
+	  To enable the stack tracer on bootup, pass in 'stacktrace'
+	  on the kernel command line.
+
+	  The stack tracer can also be enabled or disabled via the
+	  sysctl kernel.stack_tracer_enabled
 
 	  Say N if unsure.
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0b863f2cbc8..4842c969c78 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock =
 
 static int stack_trace_disabled __read_mostly;
 static DEFINE_PER_CPU(int, trace_active);
+static DEFINE_MUTEX(stack_sysctl_mutex);
+
+int stack_tracer_enabled;
+static int last_stack_tracer_enabled;
 
 static inline void check_stack(void)
 {
@@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
 	return count;
 }
 
-static struct file_operations stack_max_size_fops = {
+static const struct file_operations stack_max_size_fops = {
 	.open		= tracing_open_generic,
 	.read		= stack_max_size_read,
 	.write		= stack_max_size_write,
@@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations stack_trace_seq_ops = {
+static const struct seq_operations stack_trace_seq_ops = {
 	.start		= t_start,
 	.next		= t_next,
 	.stop		= t_stop,
@@ -288,12 +293,48 @@ static int stack_trace_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations stack_trace_fops = {
+static const struct file_operations stack_trace_fops = {
 	.open		= stack_trace_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 };
 
+int
+stack_trace_sysctl(struct ctl_table *table, int write,
+		   struct file *file, void __user *buffer, size_t *lenp,
+		   loff_t *ppos)
+{
+	int ret;
+
+	mutex_lock(&stack_sysctl_mutex);
+
+	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+	if (ret || !write ||
+	    (last_stack_tracer_enabled == stack_tracer_enabled))
+		goto out;
+
+	last_stack_tracer_enabled = stack_tracer_enabled;
+
+	if (stack_tracer_enabled)
+		register_ftrace_function(&trace_ops);
+	else
+		unregister_ftrace_function(&trace_ops);
+
+ out:
+	mutex_unlock(&stack_sysctl_mutex);
+	return ret;
+}
+
+static int start_stack_trace __initdata;
+
+static __init int enable_stacktrace(char *str)
+{
+	start_stack_trace = 1;
+	return 1;
+}
+__setup("stacktrace", enable_stacktrace);
+
 static __init int stack_trace_init(void)
 {
 	struct dentry *d_tracer;
@@ -311,7 +352,10 @@ static __init int stack_trace_init(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'stack_trace' entry\n");
 
-	register_ftrace_function(&trace_ops);
+	if (start_stack_trace) {
+		register_ftrace_function(&trace_ops);
+		stack_tracer_enabled = 1;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 9c2c48020ec0dd6ecd27e5a1298f73b40d85a595 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Tue, 16 Dec 2008 23:41:22 -0800
Subject: schedstat: consolidate per-task cpu runtime stats

Impact: simplify code

When we turn on CONFIG_SCHEDSTATS, per-task cpu runtime is accumulated
twice. Once in task->se.sum_exec_runtime and once in sched_info.cpu_time.
These two stats are exactly the same.

Given that task->se.sum_exec_runtime is always accumulated by the core
scheduler, sched_info can reuse that data instead of duplicate the accounting.

Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/base.c        | 2 +-
 include/linux/sched.h | 3 +--
 kernel/delayacct.c    | 2 +-
 kernel/sched.c        | 2 ++
 kernel/sched_stats.h  | 5 ++---
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d4677603c88..4d745bac768 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -347,7 +347,7 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			task->sched_info.cpu_time,
+			task->se.sum_exec_runtime,
 			task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8cccd6dc5d6..2d1e840ddd3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -670,8 +670,7 @@ struct reclaim_state;
 struct sched_info {
 	/* cumulative counters */
 	unsigned long pcount;	      /* # of times run on this cpu */
-	unsigned long long cpu_time,  /* time spent on the cpu */
-			   run_delay; /* time spent waiting on a runqueue */
+	unsigned long long run_delay; /* time spent waiting on a runqueue */
 
 	/* timestamps */
 	unsigned long long last_arrival,/* when we last ran on a cpu */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71b..abb6e17505e 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->sched_info.cpu_time;
+	t3 = tsk->se.sum_exec_runtime;
 
 	d->cpu_count += t1;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index f53e2b8ef52..fd835fc320b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -596,6 +596,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
 	unsigned int yld_exp_empty;
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02..3b01098164c 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
 		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_sched_info.cpu_time,
+		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
 		seq_printf(seq, "\n");
@@ -123,7 +123,7 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {
 	if (rq)
-		rq->rq_sched_info.cpu_time += delta;
+		rq->rq_cpu_time += delta;
 }
 
 static inline void
@@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
 	unsigned long long delta = task_rq(t)->clock -
 					t->sched_info.last_arrival;
 
-	t->sched_info.cpu_time += delta;
 	rq_sched_info_depart(task_rq(t), delta);
 
 	if (t->state == TASK_RUNNING)
-- 
cgit v1.2.3-70-g09d2


From 74c8a6130486bed224e960790f4aa72dd09c061e Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 17 Dec 2008 19:40:33 +0900
Subject: locking, irq: enclose irq_desc_lock_class in CONFIG_LOCKDEP

Impact: simplify code

commit "08678b0: generic: sparse irqs: use irq_desc() [...]" introduced
the irq_desc_lock_class variable.

But it is used only if CONFIG_SPARSE_IRQ=Y or CONFIG_TRACE_IRQFLAGS=Y.
Otherwise, following warnings happen:

	CC      kernel/irq/handle.o
	kernel/irq/handle.c:26: warning: 'irq_desc_lock_class' defined but not used

Actually, current early_init_irq_lock_class has a bit strange and messy ifdef.
In addition, it is not valueable.

1. this function is protected by !CONFIG_SPARSE_IRQ, but that is not necessary.
   if CONFIG_SPARSE_IRQ=Y, desc of all irq number are initialized by NULL
   at first - then this function calling is safe.

2. this function protected by CONFIG_TRACE_IRQFLAGS too. but it is not
   necessary either, because lockdep_set_class() doesn't have bad side
   effect even if CONFIG_TRACE_IRQFLAGS=n.

This patch bloat kernel size a bit on CONFIG_TRACE_IRQFLAGS=n and
CONFIG_SPARSE_IRQ=Y - but that's ok. early_init_irq_lock_class() is not
a fastpatch at all.

To avoid messy ifdefs is more important than a few bytes diet.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/lockdep.h | 2 +-
 kernel/irq/handle.c     | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 29aec6e1002..9dba554c802 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -377,7 +377,7 @@ do {								\
 
 #endif /* CONFIG_LOCK_STAT */
 
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS)
+#ifdef CONFIG_GENERIC_HARDIRQS
 extern void early_init_irq_lock_class(void);
 #else
 static inline void early_init_irq_lock_class(void)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f1a23069c20..6492400cb50 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -422,11 +422,8 @@ out:
 }
 #endif
 
-
-#ifdef CONFIG_TRACE_IRQFLAGS
 void early_init_irq_lock_class(void)
 {
-#ifndef CONFIG_SPARSE_IRQ
 	struct irq_desc *desc;
 	int i;
 
@@ -436,9 +433,7 @@ void early_init_irq_lock_class(void)
 
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	}
-#endif
 }
-#endif
 
 #ifdef CONFIG_SPARSE_IRQ
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
-- 
cgit v1.2.3-70-g09d2


From 40aa4a30d0fd075fb934de4ee8163056827052ab Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 16 Dec 2008 10:15:12 +0000
Subject: ASoC: Add WM8350 AudioPlus codec driver

The WM8350 is an integrated audio and power management subsystem which
provides a single-chip solution for portable audio and multimedia systems.

The integrated audio CODEC provides all the necessary functions for
high-quality stereo recording and playback. Programmable on-chip
amplifiers allow for the direct connection of headphones and microphones
with a minimum of external components. A programmable low-noise bias
voltage is available to feed one or more electret microphones.
Additional audio features include programmable high-pass filter in the
ADC input path.

This driver was originally written by Liam Girdwood with further updates
from me.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/linux/mfd/wm8350/audio.h |   38 +-
 sound/soc/codecs/Kconfig         |    4 +
 sound/soc/codecs/Makefile        |    2 +
 sound/soc/codecs/wm8350.c        | 1583 ++++++++++++++++++++++++++++++++++++++
 sound/soc/codecs/wm8350.h        |   20 +
 5 files changed, 1643 insertions(+), 4 deletions(-)
 create mode 100644 sound/soc/codecs/wm8350.c
 create mode 100644 sound/soc/codecs/wm8350.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/wm8350/audio.h b/include/linux/mfd/wm8350/audio.h
index 217bb22ebb8..af95a1d2f3a 100644
--- a/include/linux/mfd/wm8350/audio.h
+++ b/include/linux/mfd/wm8350/audio.h
@@ -1,7 +1,7 @@
 /*
  * audio.h  --  Audio Driver for Wolfson WM8350 PMIC
  *
- * Copyright 2007 Wolfson Microelectronics PLC
+ * Copyright 2007, 2008 Wolfson Microelectronics PLC
  *
  *  This program is free software; you can redistribute  it and/or modify it
  *  under  the terms of  the GNU General  Public License as published by the
@@ -70,9 +70,9 @@
 #define WM8350_CODEC_ISEL_0_5                   3	/* x0.5 */
 
 #define WM8350_VMID_OFF                         0
-#define WM8350_VMID_500K                        1
-#define WM8350_VMID_100K                        2
-#define WM8350_VMID_10K                         3
+#define WM8350_VMID_300K                        1
+#define WM8350_VMID_50K                         2
+#define WM8350_VMID_5K                          3
 
 /*
  * R40 (0x28) - Clock Control 1
@@ -591,8 +591,38 @@
 #define WM8350_IRQ_CODEC_MICSCD			41
 #define WM8350_IRQ_CODEC_MICD			42
 
+/*
+ * WM8350 Platform data.
+ *
+ * This must be initialised per platform for best audio performance.
+ * Please see WM8350 datasheet for information.
+ */
+struct wm8350_audio_platform_data {
+	int vmid_discharge_msecs;	/* VMID --> OFF discharge time */
+	int drain_msecs;	/* OFF drain time */
+	int cap_discharge_msecs;	/* Cap ON (from OFF) discharge time */
+	int vmid_charge_msecs;	/* vmid power up time */
+	u32 vmid_s_curve:2;	/* vmid enable s curve speed */
+	u32 dis_out4:2;		/* out4 discharge speed */
+	u32 dis_out3:2;		/* out3 discharge speed */
+	u32 dis_out2:2;		/* out2 discharge speed */
+	u32 dis_out1:2;		/* out1 discharge speed */
+	u32 vroi_out4:1;	/* out4 tie off */
+	u32 vroi_out3:1;	/* out3 tie off */
+	u32 vroi_out2:1;	/* out2 tie off */
+	u32 vroi_out1:1;	/* out1 tie off */
+	u32 vroi_enable:1;	/* enable tie off */
+	u32 codec_current_on:2;	/* current level ON */
+	u32 codec_current_standby:2;	/* current level STANDBY */
+	u32 codec_current_charge:2;	/* codec current @ vmid charge */
+};
+
+struct snd_soc_codec;
+
 struct wm8350_codec {
 	struct platform_device *pdev;
+	struct snd_soc_codec *codec;
+	struct wm8350_audio_platform_data *platform_data;
 };
 
 #endif
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index bf68052d692..c41289b5f58 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -13,6 +13,7 @@ config SND_SOC_ALL_CODECS
 	select SND_SOC_TWL4030 if TWL4030_CORE
 	select SND_SOC_UDA134X
 	select SND_SOC_UDA1380 if I2C
+	select SND_SOC_WM8350 if MFD_WM8350
 	select SND_SOC_WM8510 if (I2C || SPI_MASTER)
 	select SND_SOC_WM8580 if I2C
 	select SND_SOC_WM8728 if (I2C || SPI_MASTER)
@@ -100,6 +101,9 @@ config SND_SOC_UDA134X
 config SND_SOC_UDA1380
         tristate
 
+config SND_SOC_WM8350
+	tristate
+
 config SND_SOC_WM8510
 	tristate
 
diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile
index 9a20fddd09c..c4ddc9aa2bb 100644
--- a/sound/soc/codecs/Makefile
+++ b/sound/soc/codecs/Makefile
@@ -12,6 +12,7 @@ snd-soc-tlv320aic3x-objs := tlv320aic3x.o
 snd-soc-twl4030-objs := twl4030.o
 snd-soc-uda134x-objs := uda134x.o
 snd-soc-uda1380-objs := uda1380.o
+snd-soc-wm8350-objs := wm8350.o
 snd-soc-wm8510-objs := wm8510.o
 snd-soc-wm8580-objs := wm8580.o
 snd-soc-wm8728-objs := wm8728.o
@@ -39,6 +40,7 @@ obj-$(CONFIG_SND_SOC_TLV320AIC3X)	+= snd-soc-tlv320aic3x.o
 obj-$(CONFIG_SND_SOC_TWL4030)	+= snd-soc-twl4030.o
 obj-$(CONFIG_SND_SOC_UDA134X)	+= snd-soc-uda134x.o
 obj-$(CONFIG_SND_SOC_UDA1380)	+= snd-soc-uda1380.o
+obj-$(CONFIG_SND_SOC_WM8350)	+= snd-soc-wm8350.o
 obj-$(CONFIG_SND_SOC_WM8510)	+= snd-soc-wm8510.o
 obj-$(CONFIG_SND_SOC_WM8580)	+= snd-soc-wm8580.o
 obj-$(CONFIG_SND_SOC_WM8728)	+= snd-soc-wm8728.o
diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c
new file mode 100644
index 00000000000..4bbfb5a5894
--- /dev/null
+++ b/sound/soc/codecs/wm8350.c
@@ -0,0 +1,1583 @@
+/*
+ * wm8350.c -- WM8350 ALSA SoC audio driver
+ *
+ * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC.
+ *
+ * Author: Liam Girdwood <lg@opensource.wolfsonmicro.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/pm.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/wm8350/audio.h>
+#include <linux/mfd/wm8350/core.h>
+#include <linux/regulator/consumer.h>
+#include <sound/core.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+#include <sound/soc-dapm.h>
+#include <sound/initval.h>
+#include <sound/tlv.h>
+
+#include "wm8350.h"
+
+#define WM8350_OUTn_0dB 0x39
+
+#define WM8350_RAMP_NONE	0
+#define WM8350_RAMP_UP		1
+#define WM8350_RAMP_DOWN	2
+
+/* We only include the analogue supplies here; the digital supplies
+ * need to be available well before this driver can be probed.
+ */
+static const char *supply_names[] = {
+	"AVDD",
+	"HPVDD",
+};
+
+struct wm8350_output {
+	u16 active;
+	u16 left_vol;
+	u16 right_vol;
+	u16 ramp;
+	u16 mute;
+};
+
+struct wm8350_data {
+	struct snd_soc_codec codec;
+	struct wm8350_output out1;
+	struct wm8350_output out2;
+	struct regulator_bulk_data supplies[ARRAY_SIZE(supply_names)];
+};
+
+static unsigned int wm8350_codec_cache_read(struct snd_soc_codec *codec,
+					    unsigned int reg)
+{
+	struct wm8350 *wm8350 = codec->control_data;
+	return wm8350->reg_cache[reg];
+}
+
+static unsigned int wm8350_codec_read(struct snd_soc_codec *codec,
+				      unsigned int reg)
+{
+	struct wm8350 *wm8350 = codec->control_data;
+	return wm8350_reg_read(wm8350, reg);
+}
+
+static int wm8350_codec_write(struct snd_soc_codec *codec, unsigned int reg,
+			      unsigned int value)
+{
+	struct wm8350 *wm8350 = codec->control_data;
+	return wm8350_reg_write(wm8350, reg, value);
+}
+
+/*
+ * Ramp OUT1 PGA volume to minimise pops at stream startup and shutdown.
+ */
+static inline int wm8350_out1_ramp_step(struct snd_soc_codec *codec)
+{
+	struct wm8350_data *wm8350_data = codec->private_data;
+	struct wm8350_output *out1 = &wm8350_data->out1;
+	struct wm8350 *wm8350 = codec->control_data;
+	int left_complete = 0, right_complete = 0;
+	u16 reg, val;
+
+	/* left channel */
+	reg = wm8350_reg_read(wm8350, WM8350_LOUT1_VOLUME);
+	val = (reg & WM8350_OUT1L_VOL_MASK) >> WM8350_OUT1L_VOL_SHIFT;
+
+	if (out1->ramp == WM8350_RAMP_UP) {
+		/* ramp step up */
+		if (val < out1->left_vol) {
+			val++;
+			reg &= ~WM8350_OUT1L_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_LOUT1_VOLUME,
+					 reg | (val << WM8350_OUT1L_VOL_SHIFT));
+		} else
+			left_complete = 1;
+	} else if (out1->ramp == WM8350_RAMP_DOWN) {
+		/* ramp step down */
+		if (val > 0) {
+			val--;
+			reg &= ~WM8350_OUT1L_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_LOUT1_VOLUME,
+					 reg | (val << WM8350_OUT1L_VOL_SHIFT));
+		} else
+			left_complete = 1;
+	} else
+		return 1;
+
+	/* right channel */
+	reg = wm8350_reg_read(wm8350, WM8350_ROUT1_VOLUME);
+	val = (reg & WM8350_OUT1R_VOL_MASK) >> WM8350_OUT1R_VOL_SHIFT;
+	if (out1->ramp == WM8350_RAMP_UP) {
+		/* ramp step up */
+		if (val < out1->right_vol) {
+			val++;
+			reg &= ~WM8350_OUT1R_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_ROUT1_VOLUME,
+					 reg | (val << WM8350_OUT1R_VOL_SHIFT));
+		} else
+			right_complete = 1;
+	} else if (out1->ramp == WM8350_RAMP_DOWN) {
+		/* ramp step down */
+		if (val > 0) {
+			val--;
+			reg &= ~WM8350_OUT1R_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_ROUT1_VOLUME,
+					 reg | (val << WM8350_OUT1R_VOL_SHIFT));
+		} else
+			right_complete = 1;
+	}
+
+	/* only hit the update bit if either volume has changed this step */
+	if (!left_complete || !right_complete)
+		wm8350_set_bits(wm8350, WM8350_LOUT1_VOLUME, WM8350_OUT1_VU);
+
+	return left_complete & right_complete;
+}
+
+/*
+ * Ramp OUT2 PGA volume to minimise pops at stream startup and shutdown.
+ */
+static inline int wm8350_out2_ramp_step(struct snd_soc_codec *codec)
+{
+	struct wm8350_data *wm8350_data = codec->private_data;
+	struct wm8350_output *out2 = &wm8350_data->out2;
+	struct wm8350 *wm8350 = codec->control_data;
+	int left_complete = 0, right_complete = 0;
+	u16 reg, val;
+
+	/* left channel */
+	reg = wm8350_reg_read(wm8350, WM8350_LOUT2_VOLUME);
+	val = (reg & WM8350_OUT2L_VOL_MASK) >> WM8350_OUT1L_VOL_SHIFT;
+	if (out2->ramp == WM8350_RAMP_UP) {
+		/* ramp step up */
+		if (val < out2->left_vol) {
+			val++;
+			reg &= ~WM8350_OUT2L_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_LOUT2_VOLUME,
+					 reg | (val << WM8350_OUT1L_VOL_SHIFT));
+		} else
+			left_complete = 1;
+	} else if (out2->ramp == WM8350_RAMP_DOWN) {
+		/* ramp step down */
+		if (val > 0) {
+			val--;
+			reg &= ~WM8350_OUT2L_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_LOUT2_VOLUME,
+					 reg | (val << WM8350_OUT1L_VOL_SHIFT));
+		} else
+			left_complete = 1;
+	} else
+		return 1;
+
+	/* right channel */
+	reg = wm8350_reg_read(wm8350, WM8350_ROUT2_VOLUME);
+	val = (reg & WM8350_OUT2R_VOL_MASK) >> WM8350_OUT1R_VOL_SHIFT;
+	if (out2->ramp == WM8350_RAMP_UP) {
+		/* ramp step up */
+		if (val < out2->right_vol) {
+			val++;
+			reg &= ~WM8350_OUT2R_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_ROUT2_VOLUME,
+					 reg | (val << WM8350_OUT1R_VOL_SHIFT));
+		} else
+			right_complete = 1;
+	} else if (out2->ramp == WM8350_RAMP_DOWN) {
+		/* ramp step down */
+		if (val > 0) {
+			val--;
+			reg &= ~WM8350_OUT2R_VOL_MASK;
+			wm8350_reg_write(wm8350, WM8350_ROUT2_VOLUME,
+					 reg | (val << WM8350_OUT1R_VOL_SHIFT));
+		} else
+			right_complete = 1;
+	}
+
+	/* only hit the update bit if either volume has changed this step */
+	if (!left_complete || !right_complete)
+		wm8350_set_bits(wm8350, WM8350_LOUT2_VOLUME, WM8350_OUT2_VU);
+
+	return left_complete & right_complete;
+}
+
+/*
+ * This work ramps both output PGAs at stream start/stop time to
+ * minimise pop associated with DAPM power switching.
+ * It's best to enable Zero Cross when ramping occurs to minimise any
+ * zipper noises.
+ */
+static void wm8350_pga_work(struct work_struct *work)
+{
+	struct snd_soc_codec *codec =
+	    container_of(work, struct snd_soc_codec, delayed_work.work);
+	struct wm8350_data *wm8350_data = codec->private_data;
+	struct wm8350_output *out1 = &wm8350_data->out1,
+	    *out2 = &wm8350_data->out2;
+	int i, out1_complete, out2_complete;
+
+	/* do we need to ramp at all ? */
+	if (out1->ramp == WM8350_RAMP_NONE && out2->ramp == WM8350_RAMP_NONE)
+		return;
+
+	/* PGA volumes have 6 bits of resolution to ramp */
+	for (i = 0; i <= 63; i++) {
+		out1_complete = 1, out2_complete = 1;
+		if (out1->ramp != WM8350_RAMP_NONE)
+			out1_complete = wm8350_out1_ramp_step(codec);
+		if (out2->ramp != WM8350_RAMP_NONE)
+			out2_complete = wm8350_out2_ramp_step(codec);
+
+		/* ramp finished ? */
+		if (out1_complete && out2_complete)
+			break;
+
+		/* we need to delay longer on the up ramp */
+		if (out1->ramp == WM8350_RAMP_UP ||
+		    out2->ramp == WM8350_RAMP_UP) {
+			/* delay is longer over 0dB as increases are larger */
+			if (i >= WM8350_OUTn_0dB)
+				schedule_timeout_interruptible(msecs_to_jiffies
+							       (2));
+			else
+				schedule_timeout_interruptible(msecs_to_jiffies
+							       (1));
+		} else
+			udelay(50);	/* doesn't matter if we delay longer */
+	}
+
+	out1->ramp = WM8350_RAMP_NONE;
+	out2->ramp = WM8350_RAMP_NONE;
+}
+
+/*
+ * WM8350 Controls
+ */
+
+static int pga_event(struct snd_soc_dapm_widget *w,
+		     struct snd_kcontrol *kcontrol, int event)
+{
+	struct snd_soc_codec *codec = w->codec;
+	struct wm8350_data *wm8350_data = codec->private_data;
+	struct wm8350_output *out;
+
+	switch (w->shift) {
+	case 0:
+	case 1:
+		out = &wm8350_data->out1;
+		break;
+	case 2:
+	case 3:
+		out = &wm8350_data->out2;
+		break;
+
+	default:
+		BUG();
+		return -1;
+	}
+
+	switch (event) {
+	case SND_SOC_DAPM_POST_PMU:
+		out->ramp = WM8350_RAMP_UP;
+		out->active = 1;
+
+		if (!delayed_work_pending(&codec->delayed_work))
+			schedule_delayed_work(&codec->delayed_work,
+					      msecs_to_jiffies(1));
+		break;
+
+	case SND_SOC_DAPM_PRE_PMD:
+		out->ramp = WM8350_RAMP_DOWN;
+		out->active = 0;
+
+		if (!delayed_work_pending(&codec->delayed_work))
+			schedule_delayed_work(&codec->delayed_work,
+					      msecs_to_jiffies(1));
+		break;
+	}
+
+	return 0;
+}
+
+static int wm8350_put_volsw_2r_vu(struct snd_kcontrol *kcontrol,
+				  struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
+	struct wm8350_data *wm8350_priv = codec->private_data;
+	struct wm8350_output *out = NULL;
+	struct soc_mixer_control *mc =
+		(struct soc_mixer_control *)kcontrol->private_value;
+	int ret;
+	unsigned int reg = mc->reg;
+	u16 val;
+
+	/* For OUT1 and OUT2 we shadow the values and only actually write
+	 * them out when active in order to ensure the amplifier comes on
+	 * as quietly as possible. */
+	switch (reg) {
+	case WM8350_LOUT1_VOLUME:
+		out = &wm8350_priv->out1;
+		break;
+	case WM8350_LOUT2_VOLUME:
+		out = &wm8350_priv->out2;
+		break;
+	default:
+		break;
+	}
+
+	if (out) {
+		out->left_vol = ucontrol->value.integer.value[0];
+		out->right_vol = ucontrol->value.integer.value[1];
+		if (!out->active)
+			return 1;
+	}
+
+	ret = snd_soc_put_volsw_2r(kcontrol, ucontrol);
+	if (ret < 0)
+		return ret;
+
+	/* now hit the volume update bits (always bit 8) */
+	val = wm8350_codec_read(codec, reg);
+	wm8350_codec_write(codec, reg, val | WM8350_OUT1_VU);
+	return 1;
+}
+
+static int wm8350_get_volsw_2r(struct snd_kcontrol *kcontrol,
+			       struct snd_ctl_elem_value *ucontrol)
+{
+	struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
+	struct wm8350_data *wm8350_priv = codec->private_data;
+	struct wm8350_output *out1 = &wm8350_priv->out1;
+	struct wm8350_output *out2 = &wm8350_priv->out2;
+	struct soc_mixer_control *mc =
+		(struct soc_mixer_control *)kcontrol->private_value;
+	unsigned int reg = mc->reg;
+
+	/* If these are cached registers use the cache */
+	switch (reg) {
+	case WM8350_LOUT1_VOLUME:
+		ucontrol->value.integer.value[0] = out1->left_vol;
+		ucontrol->value.integer.value[1] = out1->right_vol;
+		return 0;
+
+	case WM8350_LOUT2_VOLUME:
+		ucontrol->value.integer.value[0] = out2->left_vol;
+		ucontrol->value.integer.value[1] = out2->right_vol;
+		return 0;
+
+	default:
+		break;
+	}
+
+	return snd_soc_get_volsw_2r(kcontrol, ucontrol);
+}
+
+/* double control with volume update */
+#define SOC_WM8350_DOUBLE_R_TLV(xname, reg_left, reg_right, xshift, xmax, \
+				xinvert, tlv_array) \
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname), \
+	.access = SNDRV_CTL_ELEM_ACCESS_TLV_READ | \
+		SNDRV_CTL_ELEM_ACCESS_READWRITE | \
+		SNDRV_CTL_ELEM_ACCESS_VOLATILE, \
+	.tlv.p = (tlv_array), \
+	.info = snd_soc_info_volsw_2r, \
+	.get = wm8350_get_volsw_2r, .put = wm8350_put_volsw_2r_vu, \
+	.private_value = (unsigned long)&(struct soc_mixer_control) \
+		{.reg = reg_left, .rreg = reg_right, .shift = xshift, \
+		 .rshift = xshift, .max = xmax, .invert = xinvert}, }
+
+static const char *wm8350_deemp[] = { "None", "32kHz", "44.1kHz", "48kHz" };
+static const char *wm8350_pol[] = { "Normal", "Inv R", "Inv L", "Inv L & R" };
+static const char *wm8350_dacmutem[] = { "Normal", "Soft" };
+static const char *wm8350_dacmutes[] = { "Fast", "Slow" };
+static const char *wm8350_dacfilter[] = { "Normal", "Sloping" };
+static const char *wm8350_adcfilter[] = { "None", "High Pass" };
+static const char *wm8350_adchp[] = { "44.1kHz", "8kHz", "16kHz", "32kHz" };
+static const char *wm8350_lr[] = { "Left", "Right" };
+
+static const struct soc_enum wm8350_enum[] = {
+	SOC_ENUM_SINGLE(WM8350_DAC_CONTROL, 4, 4, wm8350_deemp),
+	SOC_ENUM_SINGLE(WM8350_DAC_CONTROL, 0, 4, wm8350_pol),
+	SOC_ENUM_SINGLE(WM8350_DAC_MUTE_VOLUME, 14, 2, wm8350_dacmutem),
+	SOC_ENUM_SINGLE(WM8350_DAC_MUTE_VOLUME, 13, 2, wm8350_dacmutes),
+	SOC_ENUM_SINGLE(WM8350_DAC_MUTE_VOLUME, 12, 2, wm8350_dacfilter),
+	SOC_ENUM_SINGLE(WM8350_ADC_CONTROL, 15, 2, wm8350_adcfilter),
+	SOC_ENUM_SINGLE(WM8350_ADC_CONTROL, 8, 4, wm8350_adchp),
+	SOC_ENUM_SINGLE(WM8350_ADC_CONTROL, 0, 4, wm8350_pol),
+	SOC_ENUM_SINGLE(WM8350_INPUT_MIXER_VOLUME, 15, 2, wm8350_lr),
+};
+
+static DECLARE_TLV_DB_LINEAR(pre_amp_tlv, -1200, 3525);
+static DECLARE_TLV_DB_LINEAR(out_pga_tlv, -5700, 600);
+static DECLARE_TLV_DB_SCALE(dac_pcm_tlv, -7163, 36, 1);
+static DECLARE_TLV_DB_SCALE(adc_pcm_tlv, -12700, 50, 1);
+static DECLARE_TLV_DB_SCALE(out_mix_tlv, -1500, 300, 1);
+
+static const unsigned int capture_sd_tlv[] = {
+	TLV_DB_RANGE_HEAD(2),
+	0, 12, TLV_DB_SCALE_ITEM(-3600, 300, 1),
+	13, 15, TLV_DB_SCALE_ITEM(0, 0, 0),
+};
+
+static const struct snd_kcontrol_new wm8350_snd_controls[] = {
+	SOC_ENUM("Playback Deemphasis", wm8350_enum[0]),
+	SOC_ENUM("Playback DAC Inversion", wm8350_enum[1]),
+	SOC_WM8350_DOUBLE_R_TLV("Playback PCM Volume",
+				WM8350_DAC_DIGITAL_VOLUME_L,
+				WM8350_DAC_DIGITAL_VOLUME_R,
+				0, 255, 0, dac_pcm_tlv),
+	SOC_ENUM("Playback PCM Mute Function", wm8350_enum[2]),
+	SOC_ENUM("Playback PCM Mute Speed", wm8350_enum[3]),
+	SOC_ENUM("Playback PCM Filter", wm8350_enum[4]),
+	SOC_ENUM("Capture PCM Filter", wm8350_enum[5]),
+	SOC_ENUM("Capture PCM HP Filter", wm8350_enum[6]),
+	SOC_ENUM("Capture ADC Inversion", wm8350_enum[7]),
+	SOC_WM8350_DOUBLE_R_TLV("Capture PCM Volume",
+				WM8350_ADC_DIGITAL_VOLUME_L,
+				WM8350_ADC_DIGITAL_VOLUME_R,
+				0, 255, 0, adc_pcm_tlv),
+	SOC_DOUBLE_TLV("Capture Sidetone Volume",
+		       WM8350_ADC_DIVIDER,
+		       8, 4, 15, 1, capture_sd_tlv),
+	SOC_WM8350_DOUBLE_R_TLV("Capture Volume",
+				WM8350_LEFT_INPUT_VOLUME,
+				WM8350_RIGHT_INPUT_VOLUME,
+				2, 63, 0, pre_amp_tlv),
+	SOC_DOUBLE_R("Capture ZC Switch",
+		     WM8350_LEFT_INPUT_VOLUME,
+		     WM8350_RIGHT_INPUT_VOLUME, 13, 1, 0),
+	SOC_SINGLE_TLV("Left Input Left Sidetone Volume",
+		       WM8350_OUTPUT_LEFT_MIXER_VOLUME, 1, 7, 0, out_mix_tlv),
+	SOC_SINGLE_TLV("Left Input Right Sidetone Volume",
+		       WM8350_OUTPUT_LEFT_MIXER_VOLUME,
+		       5, 7, 0, out_mix_tlv),
+	SOC_SINGLE_TLV("Left Input Bypass Volume",
+		       WM8350_OUTPUT_LEFT_MIXER_VOLUME,
+		       9, 7, 0, out_mix_tlv),
+	SOC_SINGLE_TLV("Right Input Left Sidetone Volume",
+		       WM8350_OUTPUT_RIGHT_MIXER_VOLUME,
+		       1, 7, 0, out_mix_tlv),
+	SOC_SINGLE_TLV("Right Input Right Sidetone Volume",
+		       WM8350_OUTPUT_RIGHT_MIXER_VOLUME,
+		       5, 7, 0, out_mix_tlv),
+	SOC_SINGLE_TLV("Right Input Bypass Volume",
+		       WM8350_OUTPUT_RIGHT_MIXER_VOLUME,
+		       13, 7, 0, out_mix_tlv),
+	SOC_SINGLE("Left Input Mixer +20dB Switch",
+		   WM8350_INPUT_MIXER_VOLUME_L, 0, 1, 0),
+	SOC_SINGLE("Right Input Mixer +20dB Switch",
+		   WM8350_INPUT_MIXER_VOLUME_R, 0, 1, 0),
+	SOC_SINGLE_TLV("Out4 Capture Volume",
+		       WM8350_INPUT_MIXER_VOLUME,
+		       1, 7, 0, out_mix_tlv),
+	SOC_WM8350_DOUBLE_R_TLV("Out1 Playback Volume",
+				WM8350_LOUT1_VOLUME,
+				WM8350_ROUT1_VOLUME,
+				2, 63, 0, out_pga_tlv),
+	SOC_DOUBLE_R("Out1 Playback ZC Switch",
+		     WM8350_LOUT1_VOLUME,
+		     WM8350_ROUT1_VOLUME, 13, 1, 0),
+	SOC_WM8350_DOUBLE_R_TLV("Out2 Playback Volume",
+				WM8350_LOUT2_VOLUME,
+				WM8350_ROUT2_VOLUME,
+				2, 63, 0, out_pga_tlv),
+	SOC_DOUBLE_R("Out2 Playback ZC Switch", WM8350_LOUT2_VOLUME,
+		     WM8350_ROUT2_VOLUME, 13, 1, 0),
+	SOC_SINGLE("Out2 Right Invert Switch", WM8350_ROUT2_VOLUME, 10, 1, 0),
+	SOC_SINGLE_TLV("Out2 Beep Volume", WM8350_BEEP_VOLUME,
+		       5, 7, 0, out_mix_tlv),
+
+	SOC_DOUBLE_R("Out1 Playback Switch",
+		     WM8350_LOUT1_VOLUME,
+		     WM8350_ROUT1_VOLUME,
+		     14, 1, 1),
+	SOC_DOUBLE_R("Out2 Playback Switch",
+		     WM8350_LOUT2_VOLUME,
+		     WM8350_ROUT2_VOLUME,
+		     14, 1, 1),
+};
+
+/*
+ * DAPM Controls
+ */
+
+/* Left Playback Mixer */
+static const struct snd_kcontrol_new wm8350_left_play_mixer_controls[] = {
+	SOC_DAPM_SINGLE("Playback Switch",
+			WM8350_LEFT_MIXER_CONTROL, 11, 1, 0),
+	SOC_DAPM_SINGLE("Left Bypass Switch",
+			WM8350_LEFT_MIXER_CONTROL, 2, 1, 0),
+	SOC_DAPM_SINGLE("Right Playback Switch",
+			WM8350_LEFT_MIXER_CONTROL, 12, 1, 0),
+	SOC_DAPM_SINGLE("Left Sidetone Switch",
+			WM8350_LEFT_MIXER_CONTROL, 0, 1, 0),
+	SOC_DAPM_SINGLE("Right Sidetone Switch",
+			WM8350_LEFT_MIXER_CONTROL, 1, 1, 0),
+};
+
+/* Right Playback Mixer */
+static const struct snd_kcontrol_new wm8350_right_play_mixer_controls[] = {
+	SOC_DAPM_SINGLE("Playback Switch",
+			WM8350_RIGHT_MIXER_CONTROL, 12, 1, 0),
+	SOC_DAPM_SINGLE("Right Bypass Switch",
+			WM8350_RIGHT_MIXER_CONTROL, 3, 1, 0),
+	SOC_DAPM_SINGLE("Left Playback Switch",
+			WM8350_RIGHT_MIXER_CONTROL, 11, 1, 0),
+	SOC_DAPM_SINGLE("Left Sidetone Switch",
+			WM8350_RIGHT_MIXER_CONTROL, 0, 1, 0),
+	SOC_DAPM_SINGLE("Right Sidetone Switch",
+			WM8350_RIGHT_MIXER_CONTROL, 1, 1, 0),
+};
+
+/* Out4 Mixer */
+static const struct snd_kcontrol_new wm8350_out4_mixer_controls[] = {
+	SOC_DAPM_SINGLE("Right Playback Switch",
+			WM8350_OUT4_MIXER_CONTROL, 12, 1, 0),
+	SOC_DAPM_SINGLE("Left Playback Switch",
+			WM8350_OUT4_MIXER_CONTROL, 11, 1, 0),
+	SOC_DAPM_SINGLE("Right Capture Switch",
+			WM8350_OUT4_MIXER_CONTROL, 9, 1, 0),
+	SOC_DAPM_SINGLE("Out3 Playback Switch",
+			WM8350_OUT4_MIXER_CONTROL, 2, 1, 0),
+	SOC_DAPM_SINGLE("Right Mixer Switch",
+			WM8350_OUT4_MIXER_CONTROL, 1, 1, 0),
+	SOC_DAPM_SINGLE("Left Mixer Switch",
+			WM8350_OUT4_MIXER_CONTROL, 0, 1, 0),
+};
+
+/* Out3 Mixer */
+static const struct snd_kcontrol_new wm8350_out3_mixer_controls[] = {
+	SOC_DAPM_SINGLE("Left Playback Switch",
+			WM8350_OUT3_MIXER_CONTROL, 11, 1, 0),
+	SOC_DAPM_SINGLE("Left Capture Switch",
+			WM8350_OUT3_MIXER_CONTROL, 8, 1, 0),
+	SOC_DAPM_SINGLE("Out4 Playback Switch",
+			WM8350_OUT3_MIXER_CONTROL, 3, 1, 0),
+	SOC_DAPM_SINGLE("Left Mixer Switch",
+			WM8350_OUT3_MIXER_CONTROL, 0, 1, 0),
+};
+
+/* Left Input Mixer */
+static const struct snd_kcontrol_new wm8350_left_capt_mixer_controls[] = {
+	SOC_DAPM_SINGLE_TLV("L2 Capture Volume",
+			    WM8350_INPUT_MIXER_VOLUME_L, 1, 7, 0, out_mix_tlv),
+	SOC_DAPM_SINGLE_TLV("L3 Capture Volume",
+			    WM8350_INPUT_MIXER_VOLUME_L, 9, 7, 0, out_mix_tlv),
+	SOC_DAPM_SINGLE("PGA Capture Switch",
+			WM8350_LEFT_INPUT_VOLUME, 14, 1, 0),
+};
+
+/* Right Input Mixer */
+static const struct snd_kcontrol_new wm8350_right_capt_mixer_controls[] = {
+	SOC_DAPM_SINGLE_TLV("L2 Capture Volume",
+			    WM8350_INPUT_MIXER_VOLUME_R, 5, 7, 0, out_mix_tlv),
+	SOC_DAPM_SINGLE_TLV("L3 Capture Volume",
+			    WM8350_INPUT_MIXER_VOLUME_R, 13, 7, 0, out_mix_tlv),
+	SOC_DAPM_SINGLE("PGA Capture Switch",
+			WM8350_RIGHT_INPUT_VOLUME, 14, 1, 0),
+};
+
+/* Left Mic Mixer */
+static const struct snd_kcontrol_new wm8350_left_mic_mixer_controls[] = {
+	SOC_DAPM_SINGLE("INN Capture Switch", WM8350_INPUT_CONTROL, 1, 1, 0),
+	SOC_DAPM_SINGLE("INP Capture Switch", WM8350_INPUT_CONTROL, 0, 1, 0),
+	SOC_DAPM_SINGLE("IN2 Capture Switch", WM8350_INPUT_CONTROL, 2, 1, 0),
+};
+
+/* Right Mic Mixer */
+static const struct snd_kcontrol_new wm8350_right_mic_mixer_controls[] = {
+	SOC_DAPM_SINGLE("INN Capture Switch", WM8350_INPUT_CONTROL, 9, 1, 0),
+	SOC_DAPM_SINGLE("INP Capture Switch", WM8350_INPUT_CONTROL, 8, 1, 0),
+	SOC_DAPM_SINGLE("IN2 Capture Switch", WM8350_INPUT_CONTROL, 10, 1, 0),
+};
+
+/* Beep Switch */
+static const struct snd_kcontrol_new wm8350_beep_switch_controls =
+SOC_DAPM_SINGLE("Switch", WM8350_BEEP_VOLUME, 15, 1, 1);
+
+/* Out4 Capture Mux */
+static const struct snd_kcontrol_new wm8350_out4_capture_controls =
+SOC_DAPM_ENUM("Route", wm8350_enum[8]);
+
+static const struct snd_soc_dapm_widget wm8350_dapm_widgets[] = {
+
+	SND_SOC_DAPM_PGA("IN3R PGA", WM8350_POWER_MGMT_2, 11, 0, NULL, 0),
+	SND_SOC_DAPM_PGA("IN3L PGA", WM8350_POWER_MGMT_2, 10, 0, NULL, 0),
+	SND_SOC_DAPM_PGA_E("Right Out2 PGA", WM8350_POWER_MGMT_3, 3, 0, NULL,
+			   0, pga_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+	SND_SOC_DAPM_PGA_E("Left Out2 PGA", WM8350_POWER_MGMT_3, 2, 0, NULL, 0,
+			   pga_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+	SND_SOC_DAPM_PGA_E("Right Out1 PGA", WM8350_POWER_MGMT_3, 1, 0, NULL,
+			   0, pga_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+	SND_SOC_DAPM_PGA_E("Left Out1 PGA", WM8350_POWER_MGMT_3, 0, 0, NULL, 0,
+			   pga_event,
+			   SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD),
+
+	SND_SOC_DAPM_MIXER("Right Capture Mixer", WM8350_POWER_MGMT_2,
+			   7, 0, &wm8350_right_capt_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_right_capt_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Left Capture Mixer", WM8350_POWER_MGMT_2,
+			   6, 0, &wm8350_left_capt_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_left_capt_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Out4 Mixer", WM8350_POWER_MGMT_2, 5, 0,
+			   &wm8350_out4_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_out4_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Out3 Mixer", WM8350_POWER_MGMT_2, 4, 0,
+			   &wm8350_out3_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_out3_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Right Playback Mixer", WM8350_POWER_MGMT_2, 1, 0,
+			   &wm8350_right_play_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_right_play_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Left Playback Mixer", WM8350_POWER_MGMT_2, 0, 0,
+			   &wm8350_left_play_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_left_play_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Left Mic Mixer", WM8350_POWER_MGMT_2, 8, 0,
+			   &wm8350_left_mic_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_left_mic_mixer_controls)),
+
+	SND_SOC_DAPM_MIXER("Right Mic Mixer", WM8350_POWER_MGMT_2, 9, 0,
+			   &wm8350_right_mic_mixer_controls[0],
+			   ARRAY_SIZE(wm8350_right_mic_mixer_controls)),
+
+	/* virtual mixer for Beep and Out2R */
+	SND_SOC_DAPM_MIXER("Out2 Mixer", SND_SOC_NOPM, 0, 0, NULL, 0),
+
+	SND_SOC_DAPM_SWITCH("Beep", WM8350_POWER_MGMT_3, 7, 0,
+			    &wm8350_beep_switch_controls),
+
+	SND_SOC_DAPM_ADC("Right ADC", "Right Capture",
+			 WM8350_POWER_MGMT_4, 3, 0),
+	SND_SOC_DAPM_ADC("Left ADC", "Left Capture",
+			 WM8350_POWER_MGMT_4, 2, 0),
+	SND_SOC_DAPM_DAC("Right DAC", "Right Playback",
+			 WM8350_POWER_MGMT_4, 5, 0),
+	SND_SOC_DAPM_DAC("Left DAC", "Left Playback",
+			 WM8350_POWER_MGMT_4, 4, 0),
+
+	SND_SOC_DAPM_MICBIAS("Mic Bias", WM8350_POWER_MGMT_1, 4, 0),
+
+	SND_SOC_DAPM_MUX("Out4 Capture Channel", SND_SOC_NOPM, 0, 0,
+			 &wm8350_out4_capture_controls),
+
+	SND_SOC_DAPM_OUTPUT("OUT1R"),
+	SND_SOC_DAPM_OUTPUT("OUT1L"),
+	SND_SOC_DAPM_OUTPUT("OUT2R"),
+	SND_SOC_DAPM_OUTPUT("OUT2L"),
+	SND_SOC_DAPM_OUTPUT("OUT3"),
+	SND_SOC_DAPM_OUTPUT("OUT4"),
+
+	SND_SOC_DAPM_INPUT("IN1RN"),
+	SND_SOC_DAPM_INPUT("IN1RP"),
+	SND_SOC_DAPM_INPUT("IN2R"),
+	SND_SOC_DAPM_INPUT("IN1LP"),
+	SND_SOC_DAPM_INPUT("IN1LN"),
+	SND_SOC_DAPM_INPUT("IN2L"),
+	SND_SOC_DAPM_INPUT("IN3R"),
+	SND_SOC_DAPM_INPUT("IN3L"),
+};
+
+static const struct snd_soc_dapm_route audio_map[] = {
+
+	/* left playback mixer */
+	{"Left Playback Mixer", "Playback Switch", "Left DAC"},
+	{"Left Playback Mixer", "Left Bypass Switch", "IN3L PGA"},
+	{"Left Playback Mixer", "Right Playback Switch", "Right DAC"},
+	{"Left Playback Mixer", "Left Sidetone Switch", "Left Mic Mixer"},
+	{"Left Playback Mixer", "Right Sidetone Switch", "Right Mic Mixer"},
+
+	/* right playback mixer */
+	{"Right Playback Mixer", "Playback Switch", "Right DAC"},
+	{"Right Playback Mixer", "Right Bypass Switch", "IN3R PGA"},
+	{"Right Playback Mixer", "Left Playback Switch", "Left DAC"},
+	{"Right Playback Mixer", "Left Sidetone Switch", "Left Mic Mixer"},
+	{"Right Playback Mixer", "Right Sidetone Switch", "Right Mic Mixer"},
+
+	/* out4 playback mixer */
+	{"Out4 Mixer", "Right Playback Switch", "Right DAC"},
+	{"Out4 Mixer", "Left Playback Switch", "Left DAC"},
+	{"Out4 Mixer", "Right Capture Switch", "Right Capture Mixer"},
+	{"Out4 Mixer", "Out3 Playback Switch", "Out3 Mixer"},
+	{"Out4 Mixer", "Right Mixer Switch", "Right Playback Mixer"},
+	{"Out4 Mixer", "Left Mixer Switch", "Left Playback Mixer"},
+	{"OUT4", NULL, "Out4 Mixer"},
+
+	/* out3 playback mixer */
+	{"Out3 Mixer", "Left Playback Switch", "Left DAC"},
+	{"Out3 Mixer", "Left Capture Switch", "Left Capture Mixer"},
+	{"Out3 Mixer", "Left Mixer Switch", "Left Playback Mixer"},
+	{"Out3 Mixer", "Out4 Playback Switch", "Out4 Mixer"},
+	{"OUT3", NULL, "Out3 Mixer"},
+
+	/* out2 */
+	{"Right Out2 PGA", NULL, "Right Playback Mixer"},
+	{"Left Out2 PGA", NULL, "Left Playback Mixer"},
+	{"OUT2L", NULL, "Left Out2 PGA"},
+	{"OUT2R", NULL, "Right Out2 PGA"},
+
+	/* out1 */
+	{"Right Out1 PGA", NULL, "Right Playback Mixer"},
+	{"Left Out1 PGA", NULL, "Left Playback Mixer"},
+	{"OUT1L", NULL, "Left Out1 PGA"},
+	{"OUT1R", NULL, "Right Out1 PGA"},
+
+	/* ADCs */
+	{"Left ADC", NULL, "Left Capture Mixer"},
+	{"Right ADC", NULL, "Right Capture Mixer"},
+
+	/* Left capture mixer */
+	{"Left Capture Mixer", "L2 Capture Volume", "IN2L"},
+	{"Left Capture Mixer", "L3 Capture Volume", "IN3L PGA"},
+	{"Left Capture Mixer", "PGA Capture Switch", "Left Mic Mixer"},
+	{"Left Capture Mixer", NULL, "Out4 Capture Channel"},
+
+	/* Right capture mixer */
+	{"Right Capture Mixer", "L2 Capture Volume", "IN2R"},
+	{"Right Capture Mixer", "L3 Capture Volume", "IN3R PGA"},
+	{"Right Capture Mixer", "PGA Capture Switch", "Right Mic Mixer"},
+	{"Right Capture Mixer", NULL, "Out4 Capture Channel"},
+
+	/* L3 Inputs */
+	{"IN3L PGA", NULL, "IN3L"},
+	{"IN3R PGA", NULL, "IN3R"},
+
+	/* Left Mic mixer */
+	{"Left Mic Mixer", "INN Capture Switch", "IN1LN"},
+	{"Left Mic Mixer", "INP Capture Switch", "IN1LP"},
+	{"Left Mic Mixer", "IN2 Capture Switch", "IN2L"},
+
+	/* Right Mic mixer */
+	{"Right Mic Mixer", "INN Capture Switch", "IN1RN"},
+	{"Right Mic Mixer", "INP Capture Switch", "IN1RP"},
+	{"Right Mic Mixer", "IN2 Capture Switch", "IN2R"},
+
+	/* out 4 capture */
+	{"Out4 Capture Channel", NULL, "Out4 Mixer"},
+
+	/* Beep */
+	{"Beep", NULL, "IN3R PGA"},
+};
+
+static int wm8350_add_controls(struct snd_soc_codec *codec)
+{
+	int err, i;
+
+	for (i = 0; i < ARRAY_SIZE(wm8350_snd_controls); i++) {
+		err = snd_ctl_add(codec->card,
+				  snd_soc_cnew(&wm8350_snd_controls[i],
+					       codec, NULL));
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+static int wm8350_add_widgets(struct snd_soc_codec *codec)
+{
+	int ret;
+
+	ret = snd_soc_dapm_new_controls(codec,
+					wm8350_dapm_widgets,
+					ARRAY_SIZE(wm8350_dapm_widgets));
+	if (ret != 0) {
+		dev_err(codec->dev, "dapm control register failed\n");
+		return ret;
+	}
+
+	/* set up audio paths */
+	ret = snd_soc_dapm_add_routes(codec, audio_map, ARRAY_SIZE(audio_map));
+	if (ret != 0) {
+		dev_err(codec->dev, "DAPM route register failed\n");
+		return ret;
+	}
+
+	return snd_soc_dapm_new_widgets(codec);
+}
+
+static int wm8350_set_dai_sysclk(struct snd_soc_dai *codec_dai,
+				 int clk_id, unsigned int freq, int dir)
+{
+	struct snd_soc_codec *codec = codec_dai->codec;
+	struct wm8350 *wm8350 = codec->control_data;
+	u16 fll_4;
+
+	switch (clk_id) {
+	case WM8350_MCLK_SEL_MCLK:
+		wm8350_clear_bits(wm8350, WM8350_CLOCK_CONTROL_1,
+				  WM8350_MCLK_SEL);
+		break;
+	case WM8350_MCLK_SEL_PLL_MCLK:
+	case WM8350_MCLK_SEL_PLL_DAC:
+	case WM8350_MCLK_SEL_PLL_ADC:
+	case WM8350_MCLK_SEL_PLL_32K:
+		wm8350_set_bits(wm8350, WM8350_CLOCK_CONTROL_1,
+				WM8350_MCLK_SEL);
+		fll_4 = wm8350_codec_read(codec, WM8350_FLL_CONTROL_4) &
+		    ~WM8350_FLL_CLK_SRC_MASK;
+		wm8350_codec_write(codec, WM8350_FLL_CONTROL_4, fll_4 | clk_id);
+		break;
+	}
+
+	/* MCLK direction */
+	if (dir == WM8350_MCLK_DIR_OUT)
+		wm8350_set_bits(wm8350, WM8350_CLOCK_CONTROL_2,
+				WM8350_MCLK_DIR);
+	else
+		wm8350_clear_bits(wm8350, WM8350_CLOCK_CONTROL_2,
+				  WM8350_MCLK_DIR);
+
+	return 0;
+}
+
+static int wm8350_set_clkdiv(struct snd_soc_dai *codec_dai, int div_id, int div)
+{
+	struct snd_soc_codec *codec = codec_dai->codec;
+	u16 val;
+
+	switch (div_id) {
+	case WM8350_ADC_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_ADC_DIVIDER) &
+		    ~WM8350_ADC_CLKDIV_MASK;
+		wm8350_codec_write(codec, WM8350_ADC_DIVIDER, val | div);
+		break;
+	case WM8350_DAC_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_DAC_CLOCK_CONTROL) &
+		    ~WM8350_DAC_CLKDIV_MASK;
+		wm8350_codec_write(codec, WM8350_DAC_CLOCK_CONTROL, val | div);
+		break;
+	case WM8350_BCLK_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_CLOCK_CONTROL_1) &
+		    ~WM8350_BCLK_DIV_MASK;
+		wm8350_codec_write(codec, WM8350_CLOCK_CONTROL_1, val | div);
+		break;
+	case WM8350_OPCLK_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_CLOCK_CONTROL_1) &
+		    ~WM8350_OPCLK_DIV_MASK;
+		wm8350_codec_write(codec, WM8350_CLOCK_CONTROL_1, val | div);
+		break;
+	case WM8350_SYS_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_CLOCK_CONTROL_1) &
+		    ~WM8350_MCLK_DIV_MASK;
+		wm8350_codec_write(codec, WM8350_CLOCK_CONTROL_1, val | div);
+		break;
+	case WM8350_DACLR_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_DAC_LR_RATE) &
+		    ~WM8350_DACLRC_RATE_MASK;
+		wm8350_codec_write(codec, WM8350_DAC_LR_RATE, val | div);
+		break;
+	case WM8350_ADCLR_CLKDIV:
+		val = wm8350_codec_read(codec, WM8350_ADC_LR_RATE) &
+		    ~WM8350_ADCLRC_RATE_MASK;
+		wm8350_codec_write(codec, WM8350_ADC_LR_RATE, val | div);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int wm8350_set_dai_fmt(struct snd_soc_dai *codec_dai, unsigned int fmt)
+{
+	struct snd_soc_codec *codec = codec_dai->codec;
+	u16 iface = wm8350_codec_read(codec, WM8350_AI_FORMATING) &
+	    ~(WM8350_AIF_BCLK_INV | WM8350_AIF_LRCLK_INV | WM8350_AIF_FMT_MASK);
+	u16 master = wm8350_codec_read(codec, WM8350_AI_DAC_CONTROL) &
+	    ~WM8350_BCLK_MSTR;
+	u16 dac_lrc = wm8350_codec_read(codec, WM8350_DAC_LR_RATE) &
+	    ~WM8350_DACLRC_ENA;
+	u16 adc_lrc = wm8350_codec_read(codec, WM8350_ADC_LR_RATE) &
+	    ~WM8350_ADCLRC_ENA;
+
+	/* set master/slave audio interface */
+	switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
+	case SND_SOC_DAIFMT_CBM_CFM:
+		master |= WM8350_BCLK_MSTR;
+		dac_lrc |= WM8350_DACLRC_ENA;
+		adc_lrc |= WM8350_ADCLRC_ENA;
+		break;
+	case SND_SOC_DAIFMT_CBS_CFS:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* interface format */
+	switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
+	case SND_SOC_DAIFMT_I2S:
+		iface |= 0x2 << 8;
+		break;
+	case SND_SOC_DAIFMT_RIGHT_J:
+		break;
+	case SND_SOC_DAIFMT_LEFT_J:
+		iface |= 0x1 << 8;
+		break;
+	case SND_SOC_DAIFMT_DSP_A:
+		iface |= 0x3 << 8;
+		break;
+	case SND_SOC_DAIFMT_DSP_B:
+		iface |= 0x3 << 8;	/* lg not sure which mode */
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* clock inversion */
+	switch (fmt & SND_SOC_DAIFMT_INV_MASK) {
+	case SND_SOC_DAIFMT_NB_NF:
+		break;
+	case SND_SOC_DAIFMT_IB_IF:
+		iface |= WM8350_AIF_LRCLK_INV | WM8350_AIF_BCLK_INV;
+		break;
+	case SND_SOC_DAIFMT_IB_NF:
+		iface |= WM8350_AIF_BCLK_INV;
+		break;
+	case SND_SOC_DAIFMT_NB_IF:
+		iface |= WM8350_AIF_LRCLK_INV;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	wm8350_codec_write(codec, WM8350_AI_FORMATING, iface);
+	wm8350_codec_write(codec, WM8350_AI_DAC_CONTROL, master);
+	wm8350_codec_write(codec, WM8350_DAC_LR_RATE, dac_lrc);
+	wm8350_codec_write(codec, WM8350_ADC_LR_RATE, adc_lrc);
+	return 0;
+}
+
+static int wm8350_pcm_trigger(struct snd_pcm_substream *substream,
+			      int cmd, struct snd_soc_dai *codec_dai)
+{
+	struct snd_soc_codec *codec = codec_dai->codec;
+	int master = wm8350_codec_cache_read(codec, WM8350_AI_DAC_CONTROL) &
+	    WM8350_BCLK_MSTR;
+	int enabled = 0;
+
+	/* Check that the DACs or ADCs are enabled since they are
+	 * required for LRC in master mode. The DACs or ADCs need a
+	 * valid audio path i.e. pin -> ADC or DAC -> pin before
+	 * the LRC will be enabled in master mode. */
+	if (!master && cmd != SNDRV_PCM_TRIGGER_START)
+		return 0;
+
+	if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) {
+		enabled = wm8350_codec_cache_read(codec, WM8350_POWER_MGMT_4) &
+		    (WM8350_ADCR_ENA | WM8350_ADCL_ENA);
+	} else {
+		enabled = wm8350_codec_cache_read(codec, WM8350_POWER_MGMT_4) &
+		    (WM8350_DACR_ENA | WM8350_DACL_ENA);
+	}
+
+	if (!enabled) {
+		dev_err(codec->dev,
+		       "%s: invalid audio path - no clocks available\n",
+		       __func__);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int wm8350_pcm_hw_params(struct snd_pcm_substream *substream,
+				struct snd_pcm_hw_params *params,
+				struct snd_soc_dai *codec_dai)
+{
+	struct snd_soc_codec *codec = codec_dai->codec;
+	u16 iface = wm8350_codec_read(codec, WM8350_AI_FORMATING) &
+	    ~WM8350_AIF_WL_MASK;
+
+	/* bit size */
+	switch (params_format(params)) {
+	case SNDRV_PCM_FORMAT_S16_LE:
+		break;
+	case SNDRV_PCM_FORMAT_S20_3LE:
+		iface |= 0x1 << 10;
+		break;
+	case SNDRV_PCM_FORMAT_S24_LE:
+		iface |= 0x2 << 10;
+		break;
+	case SNDRV_PCM_FORMAT_S32_LE:
+		iface |= 0x3 << 10;
+		break;
+	}
+
+	wm8350_codec_write(codec, WM8350_AI_FORMATING, iface);
+	return 0;
+}
+
+static int wm8350_mute(struct snd_soc_dai *dai, int mute)
+{
+	struct snd_soc_codec *codec = dai->codec;
+	struct wm8350 *wm8350 = codec->control_data;
+
+	if (mute)
+		wm8350_set_bits(wm8350, WM8350_DAC_MUTE, WM8350_DAC_MUTE_ENA);
+	else
+		wm8350_clear_bits(wm8350, WM8350_DAC_MUTE, WM8350_DAC_MUTE_ENA);
+	return 0;
+}
+
+/* FLL divisors */
+struct _fll_div {
+	int div;		/* FLL_OUTDIV */
+	int n;
+	int k;
+	int ratio;		/* FLL_FRATIO */
+};
+
+/* The size in bits of the fll divide multiplied by 10
+ * to allow rounding later */
+#define FIXED_FLL_SIZE ((1 << 16) * 10)
+
+static inline int fll_factors(struct _fll_div *fll_div, unsigned int input,
+			      unsigned int output)
+{
+	u64 Kpart;
+	unsigned int t1, t2, K, Nmod;
+
+	if (output >= 2815250 && output <= 3125000)
+		fll_div->div = 0x4;
+	else if (output >= 5625000 && output <= 6250000)
+		fll_div->div = 0x3;
+	else if (output >= 11250000 && output <= 12500000)
+		fll_div->div = 0x2;
+	else if (output >= 22500000 && output <= 25000000)
+		fll_div->div = 0x1;
+	else {
+		printk(KERN_ERR "wm8350: fll freq %d out of range\n", output);
+		return -EINVAL;
+	}
+
+	if (input > 48000)
+		fll_div->ratio = 1;
+	else
+		fll_div->ratio = 8;
+
+	t1 = output * (1 << (fll_div->div + 1));
+	t2 = input * fll_div->ratio;
+
+	fll_div->n = t1 / t2;
+	Nmod = t1 % t2;
+
+	if (Nmod) {
+		Kpart = FIXED_FLL_SIZE * (long long)Nmod;
+		do_div(Kpart, t2);
+		K = Kpart & 0xFFFFFFFF;
+
+		/* Check if we need to round */
+		if ((K % 10) >= 5)
+			K += 5;
+
+		/* Move down to proper range now rounding is done */
+		K /= 10;
+		fll_div->k = K;
+	} else
+		fll_div->k = 0;
+
+	return 0;
+}
+
+static int wm8350_set_fll(struct snd_soc_dai *codec_dai,
+			  int pll_id, unsigned int freq_in,
+			  unsigned int freq_out)
+{
+	struct snd_soc_codec *codec = codec_dai->codec;
+	struct wm8350 *wm8350 = codec->control_data;
+	struct _fll_div fll_div;
+	int ret = 0;
+	u16 fll_1, fll_4;
+
+	/* power down FLL - we need to do this for reconfiguration */
+	wm8350_clear_bits(wm8350, WM8350_POWER_MGMT_4,
+			  WM8350_FLL_ENA | WM8350_FLL_OSC_ENA);
+
+	if (freq_out == 0 || freq_in == 0)
+		return ret;
+
+	ret = fll_factors(&fll_div, freq_in, freq_out);
+	if (ret < 0)
+		return ret;
+	dev_dbg(wm8350->dev,
+		"FLL in %d FLL out %d N 0x%x K 0x%x div %d ratio %d",
+		freq_in, freq_out, fll_div.n, fll_div.k, fll_div.div,
+		fll_div.ratio);
+
+	/* set up N.K & dividers */
+	fll_1 = wm8350_codec_read(codec, WM8350_FLL_CONTROL_1) &
+	    ~(WM8350_FLL_OUTDIV_MASK | WM8350_FLL_RSP_RATE_MASK | 0xc000);
+	wm8350_codec_write(codec, WM8350_FLL_CONTROL_1,
+			   fll_1 | (fll_div.div << 8) | 0x50);
+	wm8350_codec_write(codec, WM8350_FLL_CONTROL_2,
+			   (fll_div.ratio << 11) | (fll_div.
+						    n & WM8350_FLL_N_MASK));
+	wm8350_codec_write(codec, WM8350_FLL_CONTROL_3, fll_div.k);
+	fll_4 = wm8350_codec_read(codec, WM8350_FLL_CONTROL_4) &
+	    ~(WM8350_FLL_FRAC | WM8350_FLL_SLOW_LOCK_REF);
+	wm8350_codec_write(codec, WM8350_FLL_CONTROL_4,
+			   fll_4 | (fll_div.k ? WM8350_FLL_FRAC : 0) |
+			   (fll_div.ratio == 8 ? WM8350_FLL_SLOW_LOCK_REF : 0));
+
+	/* power FLL on */
+	wm8350_set_bits(wm8350, WM8350_POWER_MGMT_4, WM8350_FLL_OSC_ENA);
+	wm8350_set_bits(wm8350, WM8350_POWER_MGMT_4, WM8350_FLL_ENA);
+
+	return 0;
+}
+
+static int wm8350_set_bias_level(struct snd_soc_codec *codec,
+				 enum snd_soc_bias_level level)
+{
+	struct wm8350 *wm8350 = codec->control_data;
+	struct wm8350_data *priv = codec->private_data;
+	struct wm8350_audio_platform_data *platform =
+		wm8350->codec.platform_data;
+	u16 pm1;
+	int ret;
+
+	switch (level) {
+	case SND_SOC_BIAS_ON:
+		pm1 = wm8350_reg_read(wm8350, WM8350_POWER_MGMT_1) &
+		    ~(WM8350_VMID_MASK | WM8350_CODEC_ISEL_MASK);
+		wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1,
+				 pm1 | WM8350_VMID_50K |
+				 platform->codec_current_on << 14);
+		break;
+
+	case SND_SOC_BIAS_PREPARE:
+		pm1 = wm8350_reg_read(wm8350, WM8350_POWER_MGMT_1);
+		pm1 &= ~WM8350_VMID_MASK;
+		wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1,
+				 pm1 | WM8350_VMID_50K);
+		break;
+
+	case SND_SOC_BIAS_STANDBY:
+		if (codec->bias_level == SND_SOC_BIAS_OFF) {
+			ret = regulator_bulk_enable(ARRAY_SIZE(priv->supplies),
+						    priv->supplies);
+			if (ret != 0)
+				return ret;
+
+			/* Enable the system clock */
+			wm8350_set_bits(wm8350, WM8350_POWER_MGMT_4,
+					WM8350_SYSCLK_ENA);
+
+			/* mute DAC & outputs */
+			wm8350_set_bits(wm8350, WM8350_DAC_MUTE,
+					WM8350_DAC_MUTE_ENA);
+
+			/* discharge cap memory */
+			wm8350_reg_write(wm8350, WM8350_ANTI_POP_CONTROL,
+					 platform->dis_out1 |
+					 (platform->dis_out2 << 2) |
+					 (platform->dis_out3 << 4) |
+					 (platform->dis_out4 << 6));
+
+			/* wait for discharge */
+			schedule_timeout_interruptible(msecs_to_jiffies
+						       (platform->
+							cap_discharge_msecs));
+
+			/* enable antipop */
+			wm8350_reg_write(wm8350, WM8350_ANTI_POP_CONTROL,
+					 (platform->vmid_s_curve << 8));
+
+			/* ramp up vmid */
+			wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1,
+					 (platform->
+					  codec_current_charge << 14) |
+					 WM8350_VMID_5K | WM8350_VMIDEN |
+					 WM8350_VBUFEN);
+
+			/* wait for vmid */
+			schedule_timeout_interruptible(msecs_to_jiffies
+						       (platform->
+							vmid_charge_msecs));
+
+			/* turn on vmid 300k  */
+			pm1 = wm8350_reg_read(wm8350, WM8350_POWER_MGMT_1) &
+			    ~(WM8350_VMID_MASK | WM8350_CODEC_ISEL_MASK);
+			pm1 |= WM8350_VMID_300K |
+				(platform->codec_current_standby << 14);
+			wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1,
+					 pm1);
+
+
+			/* enable analogue bias */
+			pm1 |= WM8350_BIASEN;
+			wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1);
+
+			/* disable antipop */
+			wm8350_reg_write(wm8350, WM8350_ANTI_POP_CONTROL, 0);
+
+		} else {
+			/* turn on vmid 300k and reduce current */
+			pm1 = wm8350_reg_read(wm8350, WM8350_POWER_MGMT_1) &
+			    ~(WM8350_VMID_MASK | WM8350_CODEC_ISEL_MASK);
+			wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1,
+					 pm1 | WM8350_VMID_300K |
+					 (platform->
+					  codec_current_standby << 14));
+
+		}
+		break;
+
+	case SND_SOC_BIAS_OFF:
+
+		/* mute DAC & enable outputs */
+		wm8350_set_bits(wm8350, WM8350_DAC_MUTE, WM8350_DAC_MUTE_ENA);
+
+		wm8350_set_bits(wm8350, WM8350_POWER_MGMT_3,
+				WM8350_OUT1L_ENA | WM8350_OUT1R_ENA |
+				WM8350_OUT2L_ENA | WM8350_OUT2R_ENA);
+
+		/* enable anti pop S curve */
+		wm8350_reg_write(wm8350, WM8350_ANTI_POP_CONTROL,
+				 (platform->vmid_s_curve << 8));
+
+		/* turn off vmid  */
+		pm1 = wm8350_reg_read(wm8350, WM8350_POWER_MGMT_1) &
+		    ~WM8350_VMIDEN;
+		wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1);
+
+		/* wait */
+		schedule_timeout_interruptible(msecs_to_jiffies
+					       (platform->
+						vmid_discharge_msecs));
+
+		wm8350_reg_write(wm8350, WM8350_ANTI_POP_CONTROL,
+				 (platform->vmid_s_curve << 8) |
+				 platform->dis_out1 |
+				 (platform->dis_out2 << 2) |
+				 (platform->dis_out3 << 4) |
+				 (platform->dis_out4 << 6));
+
+		/* turn off VBuf and drain */
+		pm1 = wm8350_reg_read(wm8350, WM8350_POWER_MGMT_1) &
+		    ~(WM8350_VBUFEN | WM8350_VMID_MASK);
+		wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1,
+				 pm1 | WM8350_OUTPUT_DRAIN_EN);
+
+		/* wait */
+		schedule_timeout_interruptible(msecs_to_jiffies
+					       (platform->drain_msecs));
+
+		pm1 &= ~WM8350_BIASEN;
+		wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1);
+
+		/* disable anti-pop */
+		wm8350_reg_write(wm8350, WM8350_ANTI_POP_CONTROL, 0);
+
+		wm8350_clear_bits(wm8350, WM8350_LOUT1_VOLUME,
+				  WM8350_OUT1L_ENA);
+		wm8350_clear_bits(wm8350, WM8350_ROUT1_VOLUME,
+				  WM8350_OUT1R_ENA);
+		wm8350_clear_bits(wm8350, WM8350_LOUT2_VOLUME,
+				  WM8350_OUT2L_ENA);
+		wm8350_clear_bits(wm8350, WM8350_ROUT2_VOLUME,
+				  WM8350_OUT2R_ENA);
+
+		/* disable clock gen */
+		wm8350_clear_bits(wm8350, WM8350_POWER_MGMT_4,
+				  WM8350_SYSCLK_ENA);
+
+		regulator_bulk_disable(ARRAY_SIZE(priv->supplies),
+				       priv->supplies);
+		break;
+	}
+	codec->bias_level = level;
+	return 0;
+}
+
+static int wm8350_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct snd_soc_device *socdev = platform_get_drvdata(pdev);
+	struct snd_soc_codec *codec = socdev->codec;
+
+	wm8350_set_bias_level(codec, SND_SOC_BIAS_OFF);
+	return 0;
+}
+
+static int wm8350_resume(struct platform_device *pdev)
+{
+	struct snd_soc_device *socdev = platform_get_drvdata(pdev);
+	struct snd_soc_codec *codec = socdev->codec;
+
+	wm8350_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
+
+	if (codec->suspend_bias_level == SND_SOC_BIAS_ON)
+		wm8350_set_bias_level(codec, SND_SOC_BIAS_ON);
+
+	return 0;
+}
+
+static struct snd_soc_codec *wm8350_codec;
+
+static int wm8350_probe(struct platform_device *pdev)
+{
+	struct snd_soc_device *socdev = platform_get_drvdata(pdev);
+	struct snd_soc_codec *codec;
+	struct wm8350 *wm8350;
+	struct wm8350_data *priv;
+	int ret;
+	struct wm8350_output *out1;
+	struct wm8350_output *out2;
+
+	BUG_ON(!wm8350_codec);
+
+	socdev->codec = wm8350_codec;
+	codec = socdev->codec;
+	wm8350 = codec->control_data;
+	priv = codec->private_data;
+
+	/* Enable the codec */
+	wm8350_set_bits(wm8350, WM8350_POWER_MGMT_5, WM8350_CODEC_ENA);
+
+	/* Enable robust clocking mode in ADC */
+	wm8350_codec_write(codec, WM8350_SECURITY, 0xa7);
+	wm8350_codec_write(codec, 0xde, 0x13);
+	wm8350_codec_write(codec, WM8350_SECURITY, 0);
+
+	/* read OUT1 & OUT2 volumes */
+	out1 = &priv->out1;
+	out2 = &priv->out2;
+	out1->left_vol = (wm8350_reg_read(wm8350, WM8350_LOUT1_VOLUME) &
+			  WM8350_OUT1L_VOL_MASK) >> WM8350_OUT1L_VOL_SHIFT;
+	out1->right_vol = (wm8350_reg_read(wm8350, WM8350_ROUT1_VOLUME) &
+			   WM8350_OUT1R_VOL_MASK) >> WM8350_OUT1R_VOL_SHIFT;
+	out2->left_vol = (wm8350_reg_read(wm8350, WM8350_LOUT2_VOLUME) &
+			  WM8350_OUT2L_VOL_MASK) >> WM8350_OUT1L_VOL_SHIFT;
+	out2->right_vol = (wm8350_reg_read(wm8350, WM8350_ROUT2_VOLUME) &
+			   WM8350_OUT2R_VOL_MASK) >> WM8350_OUT1R_VOL_SHIFT;
+	wm8350_reg_write(wm8350, WM8350_LOUT1_VOLUME, 0);
+	wm8350_reg_write(wm8350, WM8350_ROUT1_VOLUME, 0);
+	wm8350_reg_write(wm8350, WM8350_LOUT2_VOLUME, 0);
+	wm8350_reg_write(wm8350, WM8350_ROUT2_VOLUME, 0);
+
+	/* Latch VU bits & mute */
+	wm8350_set_bits(wm8350, WM8350_LOUT1_VOLUME,
+			WM8350_OUT1_VU | WM8350_OUT1L_MUTE);
+	wm8350_set_bits(wm8350, WM8350_LOUT2_VOLUME,
+			WM8350_OUT2_VU | WM8350_OUT2L_MUTE);
+	wm8350_set_bits(wm8350, WM8350_ROUT1_VOLUME,
+			WM8350_OUT1_VU | WM8350_OUT1R_MUTE);
+	wm8350_set_bits(wm8350, WM8350_ROUT2_VOLUME,
+			WM8350_OUT2_VU | WM8350_OUT2R_MUTE);
+
+	ret = snd_soc_new_pcms(socdev, SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to create pcms\n");
+		return ret;
+	}
+
+	wm8350_add_controls(codec);
+	wm8350_add_widgets(codec);
+
+	wm8350_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
+
+	ret = snd_soc_init_card(socdev);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "failed to register card\n");
+		goto card_err;
+	}
+
+	return 0;
+
+card_err:
+	snd_soc_free_pcms(socdev);
+	snd_soc_dapm_free(socdev);
+	return ret;
+}
+
+static int wm8350_remove(struct platform_device *pdev)
+{
+	struct snd_soc_device *socdev = platform_get_drvdata(pdev);
+	struct snd_soc_codec *codec = socdev->codec;
+	struct wm8350 *wm8350 = codec->control_data;
+	int ret;
+
+	/* cancel any work waiting to be queued. */
+	ret = cancel_delayed_work(&codec->delayed_work);
+
+	/* if there was any work waiting then we run it now and
+	 * wait for its completion */
+	if (ret) {
+		schedule_delayed_work(&codec->delayed_work, 0);
+		flush_scheduled_work();
+	}
+
+	wm8350_set_bias_level(codec, SND_SOC_BIAS_OFF);
+
+	wm8350_clear_bits(wm8350, WM8350_POWER_MGMT_5, WM8350_CODEC_ENA);
+
+	return 0;
+}
+
+#define WM8350_RATES (SNDRV_PCM_RATE_8000_96000)
+
+#define WM8350_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
+			SNDRV_PCM_FMTBIT_S20_3LE |\
+			SNDRV_PCM_FMTBIT_S24_LE)
+
+struct snd_soc_dai wm8350_dai = {
+	.name = "WM8350",
+	.playback = {
+		.stream_name = "Playback",
+		.channels_min = 1,
+		.channels_max = 2,
+		.rates = WM8350_RATES,
+		.formats = WM8350_FORMATS,
+	},
+	.capture = {
+		 .stream_name = "Capture",
+		 .channels_min = 1,
+		 .channels_max = 2,
+		 .rates = WM8350_RATES,
+		 .formats = WM8350_FORMATS,
+	 },
+	.ops = {
+		 .hw_params = wm8350_pcm_hw_params,
+		 .digital_mute = wm8350_mute,
+		 .trigger = wm8350_pcm_trigger,
+		 .set_fmt = wm8350_set_dai_fmt,
+		 .set_sysclk = wm8350_set_dai_sysclk,
+		 .set_pll = wm8350_set_fll,
+		 .set_clkdiv = wm8350_set_clkdiv,
+	 },
+};
+EXPORT_SYMBOL_GPL(wm8350_dai);
+
+struct snd_soc_codec_device soc_codec_dev_wm8350 = {
+	.probe = 	wm8350_probe,
+	.remove = 	wm8350_remove,
+	.suspend = 	wm8350_suspend,
+	.resume =	wm8350_resume,
+};
+EXPORT_SYMBOL_GPL(soc_codec_dev_wm8350);
+
+static int wm8350_codec_probe(struct platform_device *pdev)
+{
+	struct wm8350 *wm8350 = platform_get_drvdata(pdev);
+	struct wm8350_data *priv;
+	struct snd_soc_codec *codec;
+	int ret, i;
+
+	if (wm8350->codec.platform_data == NULL) {
+		dev_err(&pdev->dev, "No audio platform data supplied\n");
+		return -EINVAL;
+	}
+
+	priv = kzalloc(sizeof(struct wm8350_data), GFP_KERNEL);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(supply_names); i++)
+		priv->supplies[i].supply = supply_names[i];
+
+	ret = regulator_bulk_get(wm8350->dev, ARRAY_SIZE(priv->supplies),
+				 priv->supplies);
+	if (ret != 0)
+		goto err_priv;
+
+	codec = &priv->codec;
+	wm8350->codec.codec = codec;
+
+	wm8350_dai.dev = &pdev->dev;
+
+	mutex_init(&codec->mutex);
+	INIT_LIST_HEAD(&codec->dapm_widgets);
+	INIT_LIST_HEAD(&codec->dapm_paths);
+	codec->dev = &pdev->dev;
+	codec->name = "WM8350";
+	codec->owner = THIS_MODULE;
+	codec->read = wm8350_codec_read;
+	codec->write = wm8350_codec_write;
+	codec->bias_level = SND_SOC_BIAS_OFF;
+	codec->set_bias_level = wm8350_set_bias_level;
+	codec->dai = &wm8350_dai;
+	codec->num_dai = 1;
+	codec->reg_cache_size = WM8350_MAX_REGISTER;
+	codec->private_data = priv;
+	codec->control_data = wm8350;
+
+	/* Put the codec into reset if it wasn't already */
+	wm8350_clear_bits(wm8350, WM8350_POWER_MGMT_5, WM8350_CODEC_ENA);
+
+	INIT_DELAYED_WORK(&codec->delayed_work, wm8350_pga_work);
+	ret = snd_soc_register_codec(codec);
+	if (ret != 0)
+		goto err_supply;
+
+	wm8350_codec = codec;
+
+	ret = snd_soc_register_dai(&wm8350_dai);
+	if (ret != 0)
+		goto err_codec;
+	return 0;
+
+err_codec:
+	snd_soc_unregister_codec(codec);
+err_supply:
+	regulator_bulk_free(ARRAY_SIZE(priv->supplies), priv->supplies);
+err_priv:
+	kfree(priv);
+	wm8350_codec = NULL;
+	return ret;
+}
+
+static int wm8350_codec_remove(struct platform_device *pdev)
+{
+	struct wm8350 *wm8350 = platform_get_drvdata(pdev);
+	struct snd_soc_codec *codec = wm8350->codec.codec;
+	struct wm8350_data *priv = codec->private_data;
+
+	snd_soc_unregister_dai(&wm8350_dai);
+	snd_soc_unregister_codec(codec);
+	regulator_bulk_free(ARRAY_SIZE(priv->supplies), priv->supplies);
+	kfree(priv);
+	wm8350_codec = NULL;
+	return 0;
+}
+
+static struct platform_driver wm8350_codec_driver = {
+	.driver = {
+		   .name = "wm8350-codec",
+		   .owner = THIS_MODULE,
+		   },
+	.probe = wm8350_codec_probe,
+	.remove = __devexit_p(wm8350_codec_remove),
+};
+
+static __init int wm8350_init(void)
+{
+	return platform_driver_register(&wm8350_codec_driver);
+}
+module_init(wm8350_init);
+
+static __exit void wm8350_exit(void)
+{
+	platform_driver_unregister(&wm8350_codec_driver);
+}
+module_exit(wm8350_exit);
+
+MODULE_DESCRIPTION("ASoC WM8350 driver");
+MODULE_AUTHOR("Liam Girdwood");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:wm8350-codec");
diff --git a/sound/soc/codecs/wm8350.h b/sound/soc/codecs/wm8350.h
new file mode 100644
index 00000000000..cc2887aa6c3
--- /dev/null
+++ b/sound/soc/codecs/wm8350.h
@@ -0,0 +1,20 @@
+/*
+ * wm8350.h - WM8903 audio codec interface
+ *
+ * Copyright 2008 Wolfson Microelectronics PLC.
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ */
+
+#ifndef _WM8350_H
+#define _WM8350_H
+
+#include <sound/soc.h>
+
+extern struct snd_soc_dai wm8350_dai;
+extern struct snd_soc_codec_device soc_codec_dev_wm8350;
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 21:55:32 +0100
Subject: "Tree RCU": scalable classic RCU implementation

This patch fixes a long-standing performance bug in classic RCU that
results in massive internal-to-RCU lock contention on systems with
more than a few hundred CPUs.  Although this patch creates a separate
flavor of RCU for ease of review and patch maintenance, it is intended
to replace classic RCU.

This patch still handles stress better than does mainline, so I am still
calling it ready for inclusion.  This patch is against the -tip tree.
Nevertheless, experience on an actual 1000+ CPU machine would still be
most welcome.

Most of the changes noted below were found while creating an rcutiny
(which should permit ejecting the current rcuclassic) and while doing
detailed line-by-line documentation.

Updates from v9 (http://lkml.org/lkml/2008/12/2/334):

o	Fixes from remainder of line-by-line code walkthrough,
	including comment spelling, initialization, undesirable
	narrowing due to type conversion, removing redundant memory
	barriers, removing redundant local-variable initialization,
	and removing redundant local variables.

	I do not believe that any of these fixes address the CPU-hotplug
	issues that Andi Kleen was seeing, but please do give it a whirl
	in case the machine is smarter than I am.

	A writeup from the walkthrough may be found at the following
	URL, in case you are suffering from terminal insomnia or
	masochism:

	http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf

o	Made rcutree tracing use seq_file, as suggested some time
	ago by Lai Jiangshan.

o	Added a .csv variant of the rcudata debugfs trace file, to allow
	people having thousands of CPUs to drop the data into
	a spreadsheet.	Tested with oocalc and gnumeric.  Updated
	documentation to suit.

Updates from v8 (http://lkml.org/lkml/2008/11/15/139):

o	Fix a theoretical race between grace-period initialization and
	force_quiescent_state() that could occur if more than three
	jiffies were required to carry out the grace-period
	initialization.  Which it might, if you had enough CPUs.

o	Apply Ingo's printk-standardization patch.

o	Substitute local variables for repeated accesses to global
	variables.

o	Fix comment misspellings and redundant (but harmless) increments
	of ->n_rcu_pending (this latter after having explicitly added it).

o	Apply checkpatch fixes.

Updates from v7 (http://lkml.org/lkml/2008/10/10/291):

o	Fixed a number of problems noted by Gautham Shenoy, including
	the cpu-stall-detection bug that he was having difficulty
	convincing me was real.  ;-)

o	Changed cpu-stall detection to wait for ten seconds rather than
	three in order to reduce false positive, as suggested by Ingo
	Molnar.

o	Produced a design document (http://lwn.net/Articles/305782/).
	The act of writing this document uncovered a number of both
	theoretical and "here and now" bugs as noted below.

o	Fix dynticks_nesting accounting confusion, simplify WARN_ON()
	condition, fix kerneldoc comments, and add memory barriers
	in dynticks interface functions.

o	Add more data to tracing.

o	Remove unused "rcu_barrier" field from rcu_data structure.

o	Count calls to rcu_pending() from scheduling-clock interrupt
	to use as a surrogate timebase should jiffies stop counting.

o	Fix a theoretical race between force_quiescent_state() and
	grace-period initialization.  Yes, initialization does have to
	go on for some jiffies for this race to occur, but given enough
	CPUs...

Updates from v6 (http://lkml.org/lkml/2008/9/23/448):

o	Fix a number of checkpatch.pl complaints.

o	Apply review comments from Ingo Molnar and Lai Jiangshan
	on the stall-detection code.

o	Fix several bugs in !CONFIG_SMP builds.

o	Fix a misspelled config-parameter name so that RCU now announces
	at boot time if stall detection is configured.

o	Run tests on numerous combinations of configurations parameters,
	which after the fixes above, now build and run correctly.

Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line):

o	Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a
	changeset some time ago, and finally got around to retesting
	this option).

o	Fix some tracing bugs in rcupreempt that caused incorrect
	totals to be printed.

o	I now test with a more brutal random-selection online/offline
	script (attached).  Probably more brutal than it needs to be
	on the people reading it as well, but so it goes.

o	A number of optimizations and usability improvements:

	o	Make rcu_pending() ignore the grace-period timeout when
		there is no grace period in progress.

	o	Make force_quiescent_state() avoid going for a global
		lock in the case where there is no grace period in
		progress.

	o	Rearrange struct fields to improve struct layout.

	o	Make call_rcu() initiate a grace period if RCU was
		idle, rather than waiting for the next scheduling
		clock interrupt.

	o	Invoke rcu_irq_enter() and rcu_irq_exit() only when
		idle, as suggested by Andi Kleen.  I still don't
		completely trust this change, and might back it out.

	o	Make CONFIG_RCU_TRACE be the single config variable
		manipulated for all forms of RCU, instead of the prior
		confusion.

	o	Document tracing files and formats for both rcupreempt
		and rcutree.

Updates from v4 for those missing v5 given its bad subject line:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a hierarchy,
greatly reducing the contention on the top-level lock for large machines.
This passes 10-hour concurrent rcutorture and online-offline testing on
128-CPU ppc64 without dynticks enabled, and exposes some timekeeping
bugs in presence of dynticks (exciting working on a system where
"sleep 1" hangs until interrupted...), which were fixed in the
2.6.27 kernel.  It is getting more reliable than mainline by some
measures, so the next version will be against -tip for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	More bugs will probably surface as a result of an ongoing
	line-by-line code inspection.

	Patches will be provided as required.

o	There are probably hangs, rcutorture failures, &c.  Seems
	quite stable on a 128-CPU machine, but that is kind of small
	compared to 4096 CPUs.  However, seems to do better than
	mainline.

	Patches will be provided as required.

o	The memory footprint of this version is several KB larger
	than rcuclassic.

	A separate UP-only rcutiny patch will be provided, which will
	reduce the memory footprint significantly, even compared
	to the old rcuclassic.  One such patch passes light testing,
	and has a memory footprint smaller even than rcuclassic.
	Initial reaction from various embedded guys was "it is not
	worth it", so am putting it aside.

Credits:

o	Manfred Spraul for ideas, review comments, and bugs spotted,
	as well as some good friendly competition.  ;-)

o	Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton
	for reviews and comments.

o	Thomas Gleixner for much-needed help with some timer issues
	(see patches below).

o	Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos,
	Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton
	Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines
	alive despite my heavy abuse^Wtesting.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/RCU/00-INDEX             |    2 +
 Documentation/RCU/trace.txt            |  413 +++++++++
 arch/powerpc/platforms/pseries/rtasd.c |    4 +
 include/linux/hardirq.h                |   14 +-
 include/linux/rcupdate.h               |   10 +-
 include/linux/rcutree.h                |  329 +++++++
 init/Kconfig                           |   18 +-
 kernel/Kconfig.preempt                 |   62 +-
 kernel/Makefile                        |    6 +-
 kernel/rcupreempt.c                    |   10 +
 kernel/rcupreempt_trace.c              |   10 +-
 kernel/rcutree.c                       | 1535 ++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c                 |  271 ++++++
 kernel/softirq.c                       |    5 +-
 lib/Kconfig.debug                      |   13 +
 15 files changed, 2671 insertions(+), 31 deletions(-)
 create mode 100644 Documentation/RCU/trace.txt
 create mode 100644 include/linux/rcutree.h
 create mode 100644 kernel/rcutree.c
 create mode 100644 kernel/rcutree_trace.c

(limited to 'include/linux')

diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 461481dfb7c..7dc0695a8f9 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -16,6 +16,8 @@ RTFP.txt
 	- List of RCU papers (bibliography) going back to 1980.
 torture.txt
 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+trace.txt
+	- CONFIG_RCU_TRACE debugfs files and formats
 UP.txt
 	- RCU on Uniprocessor Systems
 whatisRCU.txt
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
new file mode 100644
index 00000000000..068848240a8
--- /dev/null
+++ b/Documentation/RCU/trace.txt
@@ -0,0 +1,413 @@
+CONFIG_RCU_TRACE debugfs Files and Formats
+
+
+The rcupreempt and rcutree implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+Note that the rcuclassic implementation of RCU does not provide debugfs
+trace output.
+
+The following sections describe the debugfs files and formats for
+preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
+
+
+Preemptable RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
+counters used by preemptable RCU) rcu/rcugp (which displays grace-period
+counters), and rcu/rcustats (which internal counters for debugging RCU).
+
+The output of "cat rcu/rcuctrs" looks as follows:
+
+CPU last cur F M
+  0    5  -5 0 0
+  1   -1   0 0 0
+  2    0   1 0 0
+  3    0   1 0 0
+  4    0   1 0 0
+  5    0   1 0 0
+  6    0   2 0 0
+  7    0  -1 0 0
+  8    0   1 0 0
+ggp = 26226, state = waitzero
+
+The per-CPU fields are as follows:
+
+o	"CPU" gives the CPU number.  Offline CPUs are not displayed.
+
+o	"last" gives the value of the counter that is being decremented
+	for the current grace period phase.  In the example above,
+	the counters sum to 4, indicating that there are still four
+	RCU read-side critical sections still running that started
+	before the last counter flip.
+
+o	"cur" gives the value of the counter that is currently being
+	both incremented (by rcu_read_lock()) and decremented (by
+	rcu_read_unlock()).  In the example above, the counters sum to
+	1, indicating that there is only one RCU read-side critical section
+	still running that started after the last counter flip.
+
+o	"F" indicates whether RCU is waiting for this CPU to acknowledge
+	a counter flip.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitack".
+
+o	"M" indicates whether RCU is waiting for this CPU to execute a
+	memory barrier.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitmb".
+
+o	"ggp" is the global grace-period counter.
+
+o	"state" is the RCU state, which can be one of the following:
+
+	o	"idle": there is no grace period in progress.
+
+	o	"waitack": RCU just incremented the global grace-period
+		counter, which has the effect of reversing the roles of
+		the "last" and "cur" counters above, and is waiting for
+		all the CPUs to acknowledge the flip.  Once the flip has
+		been acknowledged, CPUs will no longer be incrementing
+		what are now the "last" counters, so that their sum will
+		decrease monotonically down to zero.
+
+	o	"waitzero": RCU is waiting for the sum of the "last" counters
+		to decrease to zero.
+
+	o	"waitmb": RCU is waiting for each CPU to execute a memory
+		barrier, which ensures that instructions from a given CPU's
+		last RCU read-side critical section cannot be reordered
+		with instructions following the memory-barrier instruction.
+
+The output of "cat rcu/rcugp" looks as follows:
+
+oldggp=48870  newggp=48873
+
+Note that reading from this file provokes a synchronize_rcu().  The
+"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
+executing the synchronize_rcu(), and the "newggp" value is also the
+"ggp" value, but taken after the synchronize_rcu() command returns.
+
+
+The output of "cat rcu/rcugp" looks as follows:
+
+na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
+1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
+z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
+
+These are counters tracking internal preemptable-RCU events, however,
+some of them may be useful for debugging algorithms using RCU.  In
+particular, the "nl", "wl", and "dl" values track the number of RCU
+callbacks in various states.  The fields are as follows:
+
+o	"na" is the total number of RCU callbacks that have been enqueued
+	since boot.
+
+o	"nl" is the number of RCU callbacks waiting for the previous
+	grace period to end so that they can start waiting on the next
+	grace period.
+
+o	"wa" is the total number of RCU callbacks that have started waiting
+	for a grace period since boot.  "na" should be roughly equal to
+	"nl" plus "wa".
+
+o	"wl" is the number of RCU callbacks currently waiting for their
+	grace period to end.
+
+o	"da" is the total number of RCU callbacks whose grace periods
+	have completed since boot.  "wa" should be roughly equal to
+	"wl" plus "da".
+
+o	"dr" is the total number of RCU callbacks that have been removed
+	from the list of callbacks ready to invoke.  "dr" should be roughly
+	equal to "da".
+
+o	"di" is the total number of RCU callbacks that have been invoked
+	since boot.  "di" should be roughly equal to "da", though some
+	early versions of preemptable RCU had a bug so that only the
+	last CPU's count of invocations was displayed, rather than the
+	sum of all CPU's counts.
+
+o	"1" is the number of calls to rcu_try_flip().  This should be
+	roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1"
+	described below.  In other words, the number of times that
+	the state machine is visited should be equal to the sum of the
+	number of times that each state is visited plus the number of
+	times that the state-machine lock acquisition failed.
+
+o	"e1" is the number of times that rcu_try_flip() was unable to
+	acquire the fliplock.
+
+o	"i1" is the number of calls to rcu_try_flip_idle().
+
+o	"ie1" is the number of times rcu_try_flip_idle() exited early
+	due to the calling CPU having no work for RCU.
+
+o	"g1" is the number of times that rcu_try_flip_idle() decided
+	to start a new grace period.  "i1" should be roughly equal to
+	"ie1" plus "g1".
+
+o	"a1" is the number of calls to rcu_try_flip_waitack().
+
+o	"ae1" is the number of times that rcu_try_flip_waitack() found
+	that at least one CPU had not yet acknowledge the new grace period
+	(AKA "counter flip").
+
+o	"a2" is the number of time rcu_try_flip_waitack() found that
+	all CPUs had acknowledged.  "a1" should be roughly equal to
+	"ae1" plus "a2".  (This particular output was collected on
+	a 128-CPU machine, hence the smaller-than-usual fraction of
+	calls to rcu_try_flip_waitack() finding all CPUs having already
+	acknowledged.)
+
+o	"z1" is the number of calls to rcu_try_flip_waitzero().
+
+o	"ze1" is the number of times that rcu_try_flip_waitzero() found
+	that not all of the old RCU read-side critical sections had
+	completed.
+
+o	"z2" is the number of times that rcu_try_flip_waitzero() finds
+	the sum of the counters equal to zero, in other words, that
+	all of the old RCU read-side critical sections had completed.
+	The value of "z1" should be roughly equal to "ze1" plus
+	"z2".
+
+o	"m1" is the number of calls to rcu_try_flip_waitmb().
+
+o	"me1" is the number of times that rcu_try_flip_waitmb() finds
+	that at least one CPU has not yet executed a memory barrier.
+
+o	"m2" is the number of times that rcu_try_flip_waitmb() finds that
+	all CPUs have executed a memory barrier.
+
+
+Hierarchical RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcudata (which displays fields in struct
+rcu_data), rcu/rcugp (which displays grace-period counters), and
+rcu/rcuhier (which displays the struct rcu_node hierarchy).
+
+The output of "cat rcu/rcudata" looks as follows:
+
+rcu:
+  0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
+  1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
+  2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
+  3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
+  4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
+  5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
+  6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
+  7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
+rcu_bh:
+  0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
+  1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
+  2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
+  3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
+  4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
+  5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+  6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
+  7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+
+The first section lists the rcu_data structures for rcu, the second for
+rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
+The fields are as follows:
+
+o	The number at the beginning of each line is the CPU number.
+	CPUs numbers followed by an exclamation mark are offline,
+	but have been online at least once since boot.	There will be
+	no output for CPUs that have never been online, which can be
+	a good thing in the surprisingly common case where NR_CPUS is
+	substantially larger than the number of actual CPUs.
+
+o	"c" is the count of grace periods that this CPU believes have
+	completed.  CPUs in dynticks idle mode may lag quite a ways
+	behind, for example, CPU 4 under "rcu" above, which has slept
+	through the past 25 RCU grace periods.	It is not unusual to
+	see CPUs lagging by thousands of grace periods.
+
+o	"g" is the count of grace periods that this CPU believes have
+	started.  Again, CPUs in dynticks idle mode may lag behind.
+	If the "c" and "g" values are equal, this CPU has already
+	reported a quiescent state for the last RCU grace period that
+	it is aware of, otherwise, the CPU believes that it owes RCU a
+	quiescent state.
+
+o	"pq" indicates that this CPU has passed through a quiescent state
+	for the current grace period.  It is possible for "pq" to be
+	"1" and "c" different than "g", which indicates that although
+	the CPU has passed through a quiescent state, either (1) this
+	CPU has not yet reported that fact, (2) some other CPU has not
+	yet reported for this grace period, or (3) both.
+
+o	"pqc" indicates which grace period the last-observed quiescent
+	state for this CPU corresponds to.  This is important for handling
+	the race between CPU 0 reporting an extended dynticks-idle
+	quiescent state for CPU 1 and CPU 1 suddenly waking up and
+	reporting its own quiescent state.  If CPU 1 was the last CPU
+	for the current grace period, then the CPU that loses this race
+	will attempt to incorrectly mark CPU 1 as having checked in for
+	the next grace period!
+
+o	"qp" indicates that RCU still expects a quiescent state from
+	this CPU.
+
+o	"rpfq" is the number of rcu_pending() calls on this CPU required
+	to induce this CPU to invoke force_quiescent_state().
+
+o	"rp" is low-order four hex digits of the count of how many times
+	rcu_pending() has been invoked on this CPU.
+
+o	"dt" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state, either by the
+	scheduler or by irq.  The number after the "/" is the interrupt
+	nesting depth when in dyntick-idle state, or one greater than
+	the interrupt-nesting depth otherwise.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"dn" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state via NMI.  If both
+	the "dt" and "dn" values are even, then this CPU is in dynticks
+	idle mode and may be ignored by RCU.  If either of these two
+	counters is odd, then RCU must be alert to the possibility of
+	an RCU read-side critical section running on this CPU.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"df" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being in
+	dynticks-idle state.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"of" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being
+	offline.  In a perfect world, this might neve happen, but it
+	turns out that offlining and onlining a CPU can take several grace
+	periods, and so there is likely to be an extended period of time
+	when RCU believes that the CPU is online when it really is not.
+	Please note that erring in the other direction (RCU believing a
+	CPU is offline when it is really alive and kicking) is a fatal
+	error, so it makes sense to err conservatively.
+
+o	"ri" is the number of times that RCU has seen fit to send a
+	reschedule IPI to this CPU in order to get it to report a
+	quiescent state.
+
+o	"ql" is the number of RCU callbacks currently residing on
+	this CPU.  This is the total number of callbacks, regardless
+	of what state they are in (new, waiting for grace period to
+	start, waiting for grace period to end, ready to invoke).
+
+o	"b" is the batch limit for this CPU.  If more than this number
+	of RCU callbacks is ready to invoke, then the remainder will
+	be deferred.
+
+
+The output of "cat rcu/rcugp" looks as follows:
+
+rcu: completed=33062  gpnum=33063
+rcu_bh: completed=464  gpnum=464
+
+Again, this output is for both "rcu" and "rcu_bh".  The fields are
+taken from the rcu_state structure, and are as follows:
+
+o	"completed" is the number of grace periods that have completed.
+	It is comparable to the "c" field from rcu/rcudata in that a
+	CPU whose "c" field matches the value of "completed" is aware
+	that the corresponding RCU grace period has completed.
+
+o	"gpnum" is the number of grace periods that have started.  It is
+	comparable to the "g" field from rcu/rcudata in that a CPU
+	whose "g" field matches the value of "gpnum" is aware that the
+	corresponding RCU grace period has started.
+
+	If these two fields are equal (as they are for "rcu_bh" above),
+	then there is no grace period in progress, in other words, RCU
+	is idle.  On the other hand, if the two fields differ (as they
+	do for "rcu" above), then an RCU grace period is in progress.
+
+
+The output of "cat rcu/rcuhier" looks as follows, with very long lines:
+
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
+1/1 0:127 ^0    
+3/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+3/3f 0:5 ^0    2/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+rcu_bh:
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
+0/1 0:127 ^0    
+0/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+0/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3
+
+This is once again split into "rcu" and "rcu_bh" portions.  The fields are
+as follows:
+
+o	"c" is exactly the same as "completed" under rcu/rcugp.
+
+o	"g" is exactly the same as "gpnum" under rcu/rcugp.
+
+o	"s" is the "signaled" state that drives force_quiescent_state()'s
+	state machine.
+
+o	"jfq" is the number of jiffies remaining for this grace period
+	before force_quiescent_state() is invoked to help push things
+	along.  Note that CPUs in dyntick-idle mode thoughout the grace
+	period will not report on their own, but rather must be check by
+	some other CPU via force_quiescent_state().
+
+o	"j" is the low-order four hex digits of the jiffies counter.
+	Yes, Paul did run into a number of problems that turned out to
+	be due to the jiffies counter no longer counting.  Why do you ask?
+
+o	"nfqs" is the number of calls to force_quiescent_state() since
+	boot.
+
+o	"nfqsng" is the number of useless calls to force_quiescent_state(),
+	where there wasn't actually a grace period active.  This can
+	happen due to races.  The number in parentheses is the difference
+	between "nfqs" and "nfqsng", or the number of times that
+	force_quiescent_state() actually did some real work.
+
+o	"fqlh" is the number of calls to force_quiescent_state() that
+	exited immediately (without even being counted in nfqs above)
+	due to contention on ->fqslock.
+
+o	Each element of the form "1/1 0:127 ^0" represents one struct
+	rcu_node.  Each line represents one level of the hierarchy, from
+	root to leaves.  It is best to think of the rcu_data structures
+	as forming yet another level after the leaves.  Note that there
+	might be either one, two, or three levels of rcu_node structures,
+	depending on the relationship between CONFIG_RCU_FANOUT and
+	CONFIG_NR_CPUS.
+	
+	o	The numbers separated by the "/" are the qsmask followed
+		by the qsmaskinit.  The qsmask will have one bit
+		set for each entity in the next lower level that
+		has not yet checked in for the current grace period.
+		The qsmaskinit will have one bit for each entity that is
+		currently expected to check in during each grace period.
+		The value of qsmaskinit is assigned to that of qsmask
+		at the beginning of each grace period.
+
+		For example, for "rcu", the qsmask of the first entry
+		of the lowest level is 0x14, meaning that we are still
+		waiting for CPUs 2 and 4 to check in for the current
+		grace period.
+
+	o	The numbers separated by the ":" are the range of CPUs
+		served by this struct rcu_node.  This can be helpful
+		in working out how the hierarchy is wired together.
+
+		For example, the first entry at the lowest level shows
+		"0:5", indicating that it covers CPUs 0 through 5.
+
+	o	The number after the "^" indicates the bit in the
+		next higher level rcu_node structure that this
+		rcu_node structure corresponds to.
+
+		For example, the first entry at the lowest level shows
+		"^0", indicating that it corresponds to bit zero in
+		the first entry at the middle level.
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index f4e55be2eea..afad9f5ac0a 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 	/* Check to see if we need to or have stopped logging */
 	if (fatal || !logging_enabled) {
 		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		else
 			rtas_log_start += 1;
 
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		wake_up_interruptible(&rtas_log_wait);
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006cc94a..9b70b923169 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
+extern void rcu_nmi_enter(void);
+extern void rcu_nmi_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+# define rcu_nmi_enter() do { } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
@@ -134,7 +138,6 @@ extern void rcu_irq_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -153,7 +156,6 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
-		rcu_irq_exit();				\
 	} while (0)
 
 /*
@@ -161,7 +163,7 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()		do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
+#define nmi_exit()		do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 86f1f5e43e3..bfd289aff57 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,15 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
-#ifdef CONFIG_CLASSIC_RCU
+#if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
-#else /* #ifdef CONFIG_CLASSIC_RCU */
+#elif defined(CONFIG_TREE_RCU)
+#include <linux/rcutree.h>
+#elif defined(CONFIG_PREEMPT_RCU)
 #include <linux/rcupreempt.h>
-#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+#else
+#error "Unknown RCU implementation specified to kernel configuration"
+#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
 
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 00000000000..d4368b7975c
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,329 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+
+#ifndef __LINUX_RCUTREE_H
+#define __LINUX_RCUTREE_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif NR_CPUS <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif NR_CPUS <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	int dynticks_nesting;	/* Track nesting level, sort of. */
+	int dynticks;		/* Even value for dynticks-idle, else odd. */
+	int dynticks_nmi;	/* Even value for either dynticks-idle or */
+				/*  not in nmi handler, else odd.  So this */
+				/*  remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+#ifdef CONFIG_NO_HZ
+	/* 3) dynticks interface. */
+	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	/* 5) state to allow this CPU to force_quiescent_state on others */
+	long n_rcu_pending;		/* rcu_pending() calls since boot. */
+	long n_rcu_pending_force_qs;	/* when to force quiescent states. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_GP_INIT		0	/* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK	1	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		2	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY		2	  /* Allow other CPUs time */
+						  /*  to take at least one */
+						  /*  scheduling clock irq */
+						  /*  before ratting on them. */
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* Force QS state. */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_lh;		/* ~Number of calls leaving */
+						/*  due to lock unavailable. */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long jiffies_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+static inline void __rcu_read_lock(void)
+{
+	preempt_disable();
+	__acquire(RCU);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock(void)
+{
+	rcu_read_release();
+	__release(RCU);
+	preempt_enable();
+}
+static inline void __rcu_read_lock_bh(void)
+{
+	local_bh_disable();
+	__acquire(RCU_BH);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock_bh(void)
+{
+	rcu_read_release();
+	__release(RCU_BH);
+	local_bh_enable();
+}
+
+#define __synchronize_sched() synchronize_rcu()
+
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
+static inline void rcu_init_sched(void)
+{
+}
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+#ifdef CONFIG_NO_HZ
+void rcu_enter_nohz(void);
+void rcu_exit_nohz(void);
+#else /* CONFIG_NO_HZ */
+static inline void rcu_enter_nohz(void)
+{
+}
+static inline void rcu_exit_nohz(void)
+{
+}
+#endif /* CONFIG_NO_HZ */
+
+#endif /* __LINUX_RCUTREE_H */
diff --git a/init/Kconfig b/init/Kconfig
index f763762d544..9dd7958a71f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -928,10 +928,16 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
-config CLASSIC_RCU
-	def_bool !PREEMPT_RCU
+config TREE_RCU_TRACE
+	def_bool RCU_TRACE && TREE_RCU
+	select DEBUG_FS
 	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.  Classic RCU is the default.  Note that the
-	  PREEMPT_RCU symbol is used to select/deselect this option.
+	  This option provides tracing for the TREE_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcutree_trace.c.
+
+config PREEMPT_RCU_TRACE
+	def_bool RCU_TRACE && PREEMPT_RCU
+	select DEBUG_FS
+	help
+	  This option provides tracing for the PREEMPT_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcupreempt_trace.c.
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1f..463f29743ea 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,29 @@ config PREEMPT
 
 endchoice
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+	  
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP system with hundreds or
+	  thousands of CPUs.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
-	default n
 	help
 	  This option reduces the latency of the kernel by making certain
 	  RCU sections preemptible. Normally RCU code is non-preemptible, if
@@ -64,16 +83,47 @@ config PREEMPT_RCU
 	  now-naive assumptions about each RCU read-side critical section
 	  remaining on a given CPU through its execution.
 
-	  Say N if you are unsure.
+endchoice
 
 config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
+	bool "Enable tracing for RCU"
+	depends on TREE_RCU || PREEMPT_RCU
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
+
+	
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19..b4fdbbff5ec 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9da..04982659875 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
 	}
 }
 
+void rcu_nmi_enter(void)
+{
+	rcu_irq_enter();
+}
+
+void rcu_nmi_exit(void)
+{
+	rcu_irq_exit();
+}
+
 static void dyntick_save_progress_counter(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ec..7c2665cac17 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
 		sp->done_length += cp->done_length;
 		sp->done_add += cp->done_add;
 		sp->done_remove += cp->done_remove;
-		atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
+		atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
 		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
-		atomic_set(&sp->rcu_try_flip_1,
-			   atomic_read(&cp->rcu_try_flip_1));
-		atomic_set(&sp->rcu_try_flip_e1,
-			   atomic_read(&cp->rcu_try_flip_e1));
+		atomic_add(atomic_read(&cp->rcu_try_flip_1),
+			   &sp->rcu_try_flip_1);
+		atomic_add(atomic_read(&cp->rcu_try_flip_e1),
+			   &sp->rcu_try_flip_e1);
 		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
 		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
 		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 00000000000..a342b032112
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
+
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+#endif /* #ifdef CONFIG_NO_HZ */
+
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require a yet-as-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  The same thing
+ * can happen while a CPU is in the process of coming online.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
+	/*
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
+	 */
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
+	}
+
+	/* The CPU is online, so send it a reschedule IPI. */
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#ifdef CONFIG_NO_HZ
+static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
+
+/**
+ * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ *
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ */
+void rcu_enter_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting--;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+	local_irq_restore(flags);
+}
+
+/*
+ * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+ *
+ * Exit nohz mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections normally occur.
+ */
+void rcu_exit_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	local_irq_restore(flags);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_enter - inform RCU of entry to NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is active.
+ */
+void rcu_nmi_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_exit - inform RCU of exit from NMI context
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is no longer active.
+ */
+void rcu_nmi_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+}
+
+/**
+ * rcu_irq_enter - inform RCU of entry to hard irq context
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks_nesting++)
+		return;
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_irq_exit - inform RCU of exit from hard irq context
+ *
+ * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
+ * to put let the RCU handling be aware that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (--rdtp->dynticks_nesting)
+		return;
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (__get_cpu_var(rcu_data).nxtlist ||
+	    __get_cpu_var(rcu_bh_data).nxtlist)
+		set_need_resched();
+}
+
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations.  Specify "rsp->completed - 1" to
+ * unconditionally invalidate any future dynticks manipulations (which is
+ * useful at the beginning of a grace period).
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	int ret;
+	int snap;
+	int snap_nmi;
+
+	snap = rdp->dynticks->dynticks;
+	snap_nmi = rdp->dynticks->dynticks_nmi;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	rdp->dynticks_nmi_snap = snap_nmi;
+	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through an dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	long curr;
+	long curr_nmi;
+	long snap;
+	long snap_nmi;
+
+	curr = rdp->dynticks->dynticks;
+	snap = rdp->dynticks_snap;
+	curr_nmi = rdp->dynticks->dynticks_nmi;
+	snap_nmi = rdp->dynticks_nmi_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq/NMI handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr != snap || (curr & 0x1) == 0) &&
+	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+		rdp->dynticks_fqs++;
+		return 1;
+	}
+
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * If there are no dynticks, then the only way that a CPU can passively
+ * be in a quiescent state is to be offline.  Unlike dynticks idle, which
+ * is a point in time during the prior (already finished) grace period,
+ * an offline CPU is always in a quiescent state, and thus can be
+ * unconditionally applied.  So just return the current value of completed.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->completed;
+}
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	return 0;
+}
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #ifdef CONFIG_SMP */
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+	rsp->gp_start = jiffies;
+	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = jiffies - rsp->jiffies_stall;
+	if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		if (rnp_cur->qsmask == 0)
+			continue;
+		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
+			if (rnp_cur->qsmask & (1UL << cpu))
+				printk(" %d", rnp_cur->grplo + cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	force_quiescent_state(rsp, 0);  /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
+			smp_processor_id(), jiffies - rsp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rnp->lock, flags);
+	if ((long)(jiffies - rsp->jiffies_stall) >= 0)
+		rsp->jiffies_stall =
+			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long delta;
+	struct rcu_node *rnp;
+
+	delta = jiffies - rsp->jiffies_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rsp);
+
+	} else if (rsp->gpnum != rsp->completed &&
+		   delta >= RCU_STALL_RAT_DELAY) {
+
+		/* They had two time units to dump stack, so complain. */
+		print_other_cpu_stall(rsp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+}
+
+/*
+ * Did someone else start a new RCU grace period start since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
+	__releases(rcu_get_root(rsp)->lock)
+{
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+	rsp->gpnum++;
+	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time(rsp);
+	dyntick_record_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	spin_unlock(&rnp->lock);  /* leave irqs disabled. */
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock(&rsp->onofflock);  /* irqs already disabled. */
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
+ */
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	/* Walk up the rcu_node hierarchy. */
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = rnp->grpmask;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  The lastcomp argument is used to
+ * make sure we are still in the grace period of interest.  We don't want
+ * to end the current grace period based on quiescent states detected in
+ * an earlier grace period!
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != ACCESS_ONCE(rsp->completed)) {
+
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	mask = rdp->grpmask;
+	if ((rnp->qsmask & mask) == 0) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+		rdp->qs_pending = 0;
+
+		/*
+		 * This GP can't end until cpu checks in, so all of our
+		 * callbacks can be processed during the next GP.
+		 */
+		rdp = rsp->rda[smp_processor_id()];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+
+	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+	cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+{
+	int i;
+	unsigned long flags;
+	long lastcomp;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	do {
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL);
+	lastcomp = rsp->completed;
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, lastcomp);
+
+	/*
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiscent, so it is now
+	 * (uncharacteristically) safe to access it rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 */
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
+static void rcu_offline_cpu(int cpu)
+{
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.  Thottle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	local_irq_save(flags);
+
+	/* Update count, and requeue any remaining callbacks. */
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	local_irq_restore(flags);
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * No memory barrier is required here because both
+		 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
+		 * only CPU-local variables that other CPUs neither
+		 * access nor modify, at least not while the corresponding
+		 * CPU is online.
+		 */
+
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an _bh
+		 * critical section, so count it.
+		 */
+
+		rcu_bh_qsctr_inc(cpu);
+	}
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+				mask |= bit;
+		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
+		return;  /* No grace period in progress, nothing to force. */
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
+		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
+		return;	/* Someone else is already on the job. */
+	}
+	if (relaxed &&
+	    (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
+	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
+				      RCU_JIFFIES_TILL_FORCE_QS;
+	if (lastcomp == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_GP_INIT:
+
+		break; /* grace period still initializing, ignore. */
+
+	case RCU_SAVE_DYNTICK:
+
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break; /* So gcc recognizes the dead code. */
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_record_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggarts. */
+		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	set_need_resched();
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  This may be called
+ * only from the CPU to whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * If an RCU GP has gone long enough, go check for dyntick
+	 * idle CPUs and, if needed, send resched IPIs.
+	 */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+	    (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+		force_quiescent_state(rsp, 1);
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, flags);  /* releases above lock */
+	}
+
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+	 */
+	smp_mb(); /* See above block comment. */
+
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+	 */
+	smp_mb(); /* See above block comment. */
+}
+
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+	   struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	local_irq_save(flags);
+	rdp = rsp->rda[smp_processor_id()];
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	/* Add the callback to our list. */
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	/* Start a new grace period if one not already started. */
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
+		unsigned long nestflag;
+		struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+		spin_lock_irqsave(&rnp_root->lock, nestflag);
+		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
+	}
+
+	/* Force the grace period if too many callbacks or too long waiting. */
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = LONG_MAX;
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+		   (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
+		force_quiescent_state(rsp, 1);
+	local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->n_rcu_pending++;
+
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rsp, rdp);
+
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
+
+	/* Does this CPU have callbacks ready to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
+
+	/* Has RCU gone idle with this CPU needing another grace period? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
+
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+		return 1;
+
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+		return 1;
+
+	/* Has an RCU GP gone long enough to send resched IPIs &c? */
+	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
+	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
+	     (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	/* RCU callbacks either ready or pending? */
+	return per_cpu(rcu_data, cpu).nxtlist ||
+	       per_cpu(rcu_bh_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	long lastcomp;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	lastcomp = rsp->completed;
+	rdp->completed = lastcomp;
+	rdp->gpnum = lastcomp;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->passed_quiesc_completed = lastcomp - 1;
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
+
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
+
+	/* Add CPU to rcu_node bitmasks. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
+
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, lastcomp);
+	local_irq_restore(flags);
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	rdtp->dynticks_nesting = 1;
+	rdtp->dynticks |= 1; 	/* need consecutive #s even for hotplug. */
+	rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Handle CPU online/offline notifcation events.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int cpustride = 1;
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++)
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		cpustride *= rsp->levelspread[i];
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->qsmaskinit = 0;
+			rnp->grplo = j * cpustride;
+			rnp->grphi = (j + 1) * cpustride - 1;
+			if (rnp->grphi >= NR_CPUS)
+				rnp->grphi = NR_CPUS - 1;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->grpmask = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->grpmask = 1UL << rnp->grpnum;
+				rnp->parent = rsp->level[i - 1] +
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for __rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		if (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+void __init __rcu_init(void)
+{
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 00000000000..d6db3e83782
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
+{
+	if (!rdp->beenonline)
+		return;
+	seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		   rdp->completed, rdp->gpnum,
+		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->qs_pending,
+		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
+		   (int)(rdp->n_rcu_pending & 0xffff));
+#ifdef CONFIG_NO_HZ
+	seq_printf(m, " dt=%d/%d dn=%d df=%lu",
+		   rdp->dynticks->dynticks,
+		   rdp->dynticks->dynticks_nesting,
+		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
+	seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+}
+
+#define PRINT_RCU_DATA(name, func, m) \
+	do { \
+		int _p_r_d_i; \
+		\
+		for_each_possible_cpu(_p_r_d_i) \
+			func(m, &per_cpu(name, _p_r_d_i)); \
+	} while (0)
+
+static int show_rcudata(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
+	seq_puts(m, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
+	return 0;
+}
+
+static int rcudata_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcudata, NULL);
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.open = rcudata_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
+{
+	if (!rdp->beenonline)
+		return;
+	seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
+		   rdp->completed, rdp->gpnum,
+		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		   rdp->qs_pending,
+		   rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
+		   rdp->n_rcu_pending);
+#ifdef CONFIG_NO_HZ
+	seq_printf(m, ",%d,%d,%d,%lu",
+		   rdp->dynticks->dynticks,
+		   rdp->dynticks->dynticks_nesting,
+		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
+	seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+}
+
+static int show_rcudata_csv(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
+#ifdef CONFIG_NO_HZ
+	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+#endif /* #ifdef CONFIG_NO_HZ */
+	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+	seq_puts(m, "\"rcu:\"\n");
+	PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
+	seq_puts(m, "\"rcu_bh:\"\n");
+	PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
+	return 0;
+}
+
+static int rcudata_csv_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcudata_csv, NULL);
+}
+
+static struct file_operations rcudata_csv_fops = {
+	.owner = THIS_MODULE,
+	.open = rcudata_csv_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
+{
+	int level = 0;
+	struct rcu_node *rnp;
+
+	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
+	              "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+		   rsp->completed, rsp->gpnum, rsp->signaled,
+		   (long)(rsp->jiffies_force_qs - jiffies),
+		   (int)(jiffies & 0xffff),
+		   rsp->n_force_qs, rsp->n_force_qs_ngp,
+		   rsp->n_force_qs - rsp->n_force_qs_ngp,
+		   rsp->n_force_qs_lh);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {
+			seq_puts(m, "\n");
+			level = rnp->level;
+		}
+		seq_printf(m, "%lx/%lx %d:%d ^%d    ",
+			   rnp->qsmask, rnp->qsmaskinit,
+			   rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	seq_puts(m, "\n");
+}
+
+static int show_rcuhier(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	print_one_rcu_state(m, &rcu_state);
+	seq_puts(m, "rcu_bh:\n");
+	print_one_rcu_state(m, &rcu_bh_state);
+	return 0;
+}
+
+static int rcuhier_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcuhier, NULL);
+}
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.open = rcuhier_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int show_rcugp(struct seq_file *m, void *unused)
+{
+	seq_printf(m, "rcu: completed=%ld  gpnum=%ld\n",
+		   rcu_state.completed, rcu_state.gpnum);
+	seq_printf(m, "rcu_bh: completed=%ld  gpnum=%ld\n",
+		   rcu_bh_state.completed, rcu_bh_state.gpnum);
+	return 0;
+}
+
+static int rcugp_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcugp, NULL);
+}
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.open = rcugp_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static int __init rcuclassic_trace_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
+						NULL, &rcudata_csv_fops);
+	if (!datadir_csv)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	if (datadir)
+		debugfs_remove(datadir);
+	if (datadir_csv)
+		debugfs_remove(datadir_csv);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return 1;
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(datadir_csv);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
+MODULE_LICENSE("GPL");
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d6..80d323e6f61 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,6 +269,7 @@ void irq_enter(void)
 {
 	int cpu = smp_processor_id();
 
+	rcu_irq_enter();
 	if (idle_cpu(cpu) && !in_interrupt()) {
 		__irq_enter();
 		tick_check_idle(cpu);
@@ -295,9 +296,9 @@ void irq_exit(void)
 
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
-	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-		tick_nohz_stop_sched_tick(0);
 	rcu_irq_exit();
+	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
+		tick_nohz_stop_sched_tick(0);
 #endif
 	preempt_enable_no_resched();
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443b..465d822f3f5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -619,6 +619,19 @@ config RCU_CPU_STALL_DETECTOR
 
 	  Say N if you are unsure.
 
+config RCU_CPU_STALL_DETECTOR
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU || TREE_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3-70-g09d2


From 3c8bb73ace6249bd089b70c941440441940e3365 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 18 Dec 2008 11:41:27 -0800
Subject: x86: PAT: store vm_pgoff for all linear_over_vma_region mappings - v3

Impact: Code transformation, new functions added should have no effect.

Drivers use mmap followed by pgprot_* and remap_pfn_range or vm_insert_pfn,
in order to export reserved memory to userspace. Currently, such mappings are
not tracked and hence not kept consistent with other mappings (/dev/mem,
pci resource, ioremap) for the sme memory, that may exist in the system.

The following patchset adds x86 PAT attribute tracking and untracking for
pfnmap related APIs.

First three patches in the patchset are changing the generic mm code to fit
in this tracking. Last four patches are x86 specific to make things work
with x86 PAT code. The patchset aso introduces pgprot_writecombine interface,
which gives writecombine mapping when enabled, falling back to
pgprot_noncached otherwise.

This patch:

While working on x86 PAT, we faced some hurdles with trackking
remap_pfn_range() regions, as we do not have any information to say
whether that PFNMAP mapping is linear for the entire vma range or
it is smaller granularity regions within the vma.

A simple solution to this is to use vm_pgoff as an indicator for
linear mapping over the vma region. Currently, remap_pfn_range
only sets vm_pgoff for COW mappings. Below patch changes the
logic and sets the vm_pgoff irrespective of COW. This will still not
be enough for the case where pfn is zero (vma region mapped to
physical address zero). But, for all the other cases, we can look at
pfnmap VMAs and say whether the mappng is for the entire vma region
or not.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/linux/mm.h | 9 +++++++++
 mm/memory.c        | 7 +++----
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f74341..2be8d9b5e46 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -145,6 +145,15 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 
+static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
+{
+	return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff);
+}
+
+static inline int is_pfn_mapping(struct vm_area_struct *vma)
+{
+	return (vma->vm_flags & VM_PFNMAP);
+}
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
diff --git a/mm/memory.c b/mm/memory.c
index 164951c4730..cef95c8c77f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1575,11 +1575,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * behaviour that some programs depend on. We mark the "original"
 	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
 	 */
-	if (is_cow_mapping(vma->vm_flags)) {
-		if (addr != vma->vm_start || end != vma->vm_end)
-			return -EINVAL;
+	if (addr == vma->vm_start && end == vma->vm_end)
 		vma->vm_pgoff = pfn;
-	}
+	else if (is_cow_mapping(vma->vm_flags))
+		return -EINVAL;
 
 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 
-- 
cgit v1.2.3-70-g09d2


From e121e418441525b5636321fe03d16f0193ad218e Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 18 Dec 2008 11:41:28 -0800
Subject: x86: PAT: add follow_pfnmp_pte routine to help tracking pfnmap pages
 - v3

Impact: New currently unused interface.

Add a generic interface to follow pfn in a pfnmap vma range. This is used by
one of the subsequent x86 PAT related patch to keep track of memory types
for vma regions across vma copy and free.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/linux/mm.h |  3 +++
 mm/memory.c        | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2be8d9b5e46..a25024ff9c1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1223,6 +1223,9 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
+int follow_pfnmap_pte(struct vm_area_struct *vma,
+				unsigned long address, pte_t *ret_ptep);
+
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index cef95c8c77f..8ca6bbf34ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1111,6 +1111,49 @@ no_page_table:
 	return page;
 }
 
+int follow_pfnmap_pte(struct vm_area_struct *vma, unsigned long address,
+			pte_t *ret_ptep)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (!is_pfn_mapping(vma))
+		goto err;
+
+	page = NULL;
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto err;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto err;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto err;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto err_unlock;
+
+	*ret_ptep = pte;
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+
+err_unlock:
+	pte_unmap_unlock(ptep, ptl);
+err:
+	return -EINVAL;
+}
+
 /* Can we do the FOLL_ANON optimization? */
 static inline int use_zero_page(struct vm_area_struct *vma)
 {
-- 
cgit v1.2.3-70-g09d2


From 2ab640379a0ab4cef746ced1d7e04a0941774bcb Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Thu, 18 Dec 2008 11:41:29 -0800
Subject: x86: PAT: hooks in generic vm code to help archs to track pfnmap
 regions - v3

Impact: Introduces new hooks, which are currently null.

Introduce generic hooks in remap_pfn_range and vm_insert_pfn and
corresponding copy and free routines with reserve and free tracking.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/linux/mm.h |  6 +++++
 mm/memory.c        | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a25024ff9c1..87ecb40e11a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -155,6 +155,12 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_PFNMAP);
 }
 
+extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
+				unsigned long pfn, unsigned long size);
+extern int track_pfn_vma_copy(struct vm_area_struct *vma);
+extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+				unsigned long size);
+
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
diff --git a/mm/memory.c b/mm/memory.c
index 8ca6bbf34ad..1e8f0d347c0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -99,6 +99,50 @@ int randomize_va_space __read_mostly =
 					2;
 #endif
 
+#ifndef track_pfn_vma_new
+/*
+ * Interface that can be used by architecture code to keep track of
+ * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
+ *
+ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
+ * for physical range indicated by pfn and size.
+ */
+int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
+			unsigned long pfn, unsigned long size)
+{
+	return 0;
+}
+#endif
+
+#ifndef track_pfn_vma_copy
+/*
+ * Interface that can be used by architecture code to keep track of
+ * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
+ *
+ * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ */
+int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+	return 0;
+}
+#endif
+
+#ifndef untrack_pfn_vma
+/*
+ * Interface that can be used by architecture code to keep track of
+ * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
+ *
+ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case size can be zero).
+ */
+void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+			unsigned long size)
+{
+}
+#endif
+
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
@@ -669,6 +713,16 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_pfn_mapping(vma)) {
+		/*
+		 * We do not free on error cases below as remove_vma
+		 * gets called on error from higher level routine
+		 */
+		ret = track_pfn_vma_copy(vma);
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * We need to invalidate the secondary MMU mappings only when
 	 * there could be a permission downgrade on the ptes of the
@@ -915,6 +969,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		if (vma->vm_flags & VM_ACCOUNT)
 			*nr_accounted += (end - start) >> PAGE_SHIFT;
 
+		if (is_pfn_mapping(vma))
+			untrack_pfn_vma(vma, 0, 0);
+
 		while (start != end) {
 			if (!tlb_start_valid) {
 				tlb_start = start;
@@ -1473,6 +1530,7 @@ out:
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
 {
+	int ret;
 	/*
 	 * Technically, architectures with pte_special can avoid all these
 	 * restrictions (same for remap_pfn_range).  However we would like
@@ -1487,7 +1545,15 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
-	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+	if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
+		return -EINVAL;
+
+	ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+
+	if (ret)
+		untrack_pfn_vma(vma, pfn, PAGE_SIZE);
+
+	return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
 
@@ -1625,6 +1691,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 
 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 
+	err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
+	if (err)
+		return -EINVAL;
+
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
@@ -1636,6 +1706,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+
+	if (err)
+		untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
-- 
cgit v1.2.3-70-g09d2


From 078a55db075bf4dcc03115dc94875b3dd83f69d4 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 18 Dec 2008 16:57:52 -0800
Subject: sparseirq: add kernel-doc notation for new member in irq_desc, -v2

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 36a01574678..7fa7f30fccb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -134,6 +134,9 @@ struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  * @irq:		interrupt number for this descriptor
+ * @timer_rand_state:	pointer to timer rand state struct
+ * @kstat_irqs:		irq stats per cpu
+ * @irq_2_iommu:	iommu with this irq
  * @handle_irq:		highlevel irq-events handler [if NULL, __do_IRQ()]
  * @chip:		low level interrupt hardware access
  * @msi_desc:		MSI descriptor
-- 
cgit v1.2.3-70-g09d2


From 7b4967c532045a1983d6d4af5c69cc7c5109f62b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Fri, 19 Dec 2008 16:56:37 +1030
Subject: cpumask: Add alloc_cpumask_var_node()

Impact: New API

This will be needed in x86 code to allocate the domain and old_domain
cpumasks on the same node as where the containing irq_cfg struct is
allocated.

(Also fixes double-dump_stack on rare CONFIG_DEBUG_PER_CPU_MAPS case)

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (re-impl alloc_cpumask_var)
---
 include/linux/cpumask.h |  7 +++++++
 lib/cpumask.c           | 11 ++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d4bf52603e6..b5ad19a6f43 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1025,6 +1025,7 @@ static inline size_t cpumask_size(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 typedef struct cpumask *cpumask_var_t;
 
+bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
 bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags);
 void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
@@ -1038,6 +1039,12 @@ static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 	return true;
 }
 
+static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
+					  int node)
+{
+	return true;
+}
+
 static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
 {
 }
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 8d03f22c6ce..3f258f58c85 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -76,15 +76,14 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
-bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
 	if (likely(slab_is_available()))
-		*mask = kmalloc(cpumask_size(), flags);
+		*mask = kmalloc_node(cpumask_size(), flags, node);
 	else {
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 		printk(KERN_ERR
 			"=> alloc_cpumask_var: kmalloc not available!\n");
-		dump_stack();
 #endif
 		*mask = NULL;
 	}
@@ -96,6 +95,12 @@ bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
 #endif
 	return *mask != NULL;
 }
+EXPORT_SYMBOL(alloc_cpumask_var_node);
+
+bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
+{
+	return alloc_cpumask_var_node(mask, flags, numa_node_id());
+}
 EXPORT_SYMBOL(alloc_cpumask_var);
 
 void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
-- 
cgit v1.2.3-70-g09d2


From e057d7aea9d8f2a46cd440d8bfb72245d4e72d79 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 15 Dec 2008 20:26:48 -0800
Subject: cpumask: add sysfs displays for configured and disabled cpu maps

Impact: add new sysfs files.

Add sysfs files "kernel_max" and "offline" to display the max CPU index
allowed (NR_CPUS-1), and the map of cpus that are offline.

Cpus can be offlined via HOTPLUG, disabled by the BIOS ACPI tables, or
if they exceed the number of cpus allowed by the NR_CPUS config option,
or the "maxcpus=NUM" kernel start parameter.

The "possible_cpus=NUM" parameter can also extend the number of possible
cpus allowed, in which case the cpus not present at startup will be
in the offline state.  (These cpus can be HOTPLUGGED ON after system
startup [pending a follow-on patch to provide the capability via the
/sys/devices/sys/cpu/cpuN/online mechanism to bring them online.])

By design, the "offlined cpus > possible cpus" display will always
use the following formats:

  * all possible cpus online:   "x$"    or "x-y$"
  * some possible cpus offline: ".*,x$" or ".*,x-y$"

where:
  x == number of possible cpus (nr_cpu_ids); and
  y == number of cpus >= NR_CPUS or maxcpus (if y > x).

One use of this feature is for distros to select (or configure) the
appropriate kernel to install for the resident system.

Notes:
  * cpus offlined <= possible cpus will be printed for all architectures.
  * cpus offlined >  possible cpus will only be printed for arches that
  	set 'total_cpus' [X86 only in this patch].

Based on tip/cpus4096 + .../rusty/linux-2.6-for-ingo.git/master +
	 x86-only-patches sent 12/15.

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/base/cpu.c  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/smp.h |  3 +++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 4259072f5bd..2aef96f20b3 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -128,10 +128,54 @@ print_cpus_func(online);
 print_cpus_func(possible);
 print_cpus_func(present);
 
+/*
+ * Print values for NR_CPUS and offlined cpus
+ */
+static ssize_t print_cpus_kernel_max(struct sysdev_class *class, char *buf)
+{
+	int n = snprintf(buf, PAGE_SIZE-2, "%d\n", CONFIG_NR_CPUS - 1);
+	return n;
+}
+static SYSDEV_CLASS_ATTR(kernel_max, 0444, print_cpus_kernel_max, NULL);
+
+/* arch-optional setting to enable display of offline cpus >= nr_cpu_ids */
+unsigned int total_cpus;
+
+static ssize_t print_cpus_offline(struct sysdev_class *class, char *buf)
+{
+	int n = 0, len = PAGE_SIZE-2;
+	cpumask_var_t offline;
+
+	/* display offline cpus < nr_cpu_ids */
+	if (!alloc_cpumask_var(&offline, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_complement(offline, cpu_online_mask);
+	n = cpulist_scnprintf(buf, len, offline);
+	free_cpumask_var(offline);
+
+	/* display offline cpus >= nr_cpu_ids */
+	if (total_cpus && nr_cpu_ids < total_cpus) {
+		if (n && n < len)
+			buf[n++] = ',';
+
+		if (nr_cpu_ids == total_cpus-1)
+			n += snprintf(&buf[n], len - n, "%d", nr_cpu_ids);
+		else
+			n += snprintf(&buf[n], len - n, "%d-%d",
+						      nr_cpu_ids, total_cpus-1);
+	}
+
+	n += snprintf(&buf[n], len - n, "\n");
+	return n;
+}
+static SYSDEV_CLASS_ATTR(offline, 0444, print_cpus_offline, NULL);
+
 static struct sysdev_class_attribute *cpu_state_attr[] = {
 	&attr_online_map,
 	&attr_possible_map,
 	&attr_present_map,
+	&attr_kernel_max,
+	&attr_offline,
 };
 
 static int cpu_states_init(void)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 3f9a60043a9..0d5770c2e43 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -21,6 +21,9 @@ struct call_single_data {
 	u16 priv;
 };
 
+/* total number of cpus in this system (may exceed NR_CPUS) */
+extern unsigned int total_cpus;
+
 #ifdef CONFIG_SMP
 
 #include <linux/preempt.h>
-- 
cgit v1.2.3-70-g09d2


From 716707b29906e1d8d190defe3d646610b097a861 Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 23:26:02 +0530
Subject: sched: convert BALANCE_FOR_xx_POWER to inline functions

Impact: cleanup

BALANCE_FOR_MC_POWER and similar macros defined in sched.h are
not constants and have various condition checks and significant
amount of code that is not suitable to be contain in a macro.
Also there could be side effects on the expressions passed to
some of them like test_sd_parent().

This patch converts all complex macros related to power savings
balance to inline functions.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    | 33 ++++++++++++++++++++++++---------
 include/linux/topology.h |  4 ++--
 2 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4240f6bfa81..1210fb0e45f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -763,15 +763,23 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		1024	/* Only a single load balancing instance */
 #define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
 
-#define BALANCE_FOR_MC_POWER	\
-	(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+extern int sched_mc_power_savings, sched_smt_power_savings;
+
+static inline int sd_balance_for_mc_power(void)
+{
+	if (sched_smt_power_savings)
+		return SD_POWERSAVINGS_BALANCE;
 
-#define BALANCE_FOR_PKG_POWER	\
-	((sched_mc_power_savings || sched_smt_power_savings) ?	\
-	 SD_POWERSAVINGS_BALANCE : 0)
+	return 0;
+}
 
-#define test_sd_parent(sd, flag)	((sd->parent &&		\
-					 (sd->parent->flags & flag)) ? 1 : 0)
+static inline int sd_balance_for_package_power(void)
+{
+	if (sched_mc_power_savings | sched_smt_power_savings)
+		return SD_POWERSAVINGS_BALANCE;
+
+	return 0;
+}
 
 
 struct sched_group {
@@ -1399,6 +1407,15 @@ struct task_struct {
 #endif
 };
 
+/* Test a flag in parent sched domain */
+static inline int test_sd_parent(struct sched_domain *sd, int flag)
+{
+	if (sd->parent && (sd->parent->flags & flag))
+		return 1;
+
+	return 0;
+}
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -2256,8 +2273,6 @@ __trace_special(void *__tr, void *__data,
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-extern int sched_mc_power_savings, sched_smt_power_savings;
-
 extern void normalize_rt_tasks(void);
 
 #ifdef CONFIG_GROUP_SCHED
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0c5b5ac36d8..0ce7c0dac06 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -125,7 +125,7 @@ int arch_update_cpu_topology(void);
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_BALANCE	\
 				| SD_SHARE_PKG_RESOURCES\
-				| BALANCE_FOR_MC_POWER,	\
+				| sd_balance_for_mc_power(),\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
@@ -150,7 +150,7 @@ int arch_update_cpu_topology(void);
 				| SD_BALANCE_FORK	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_BALANCE	\
-				| BALANCE_FOR_PKG_POWER,\
+				| sd_balance_for_package_power(),\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
-- 
cgit v1.2.3-70-g09d2


From afb8a9b70b86866a60e08b2956ae4e1406390336 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Thu, 18 Dec 2008 23:26:09 +0530
Subject: sched: framework for sched_mc/smt_power_savings=N

Impact: extend range of /sys/devices/system/cpu/sched_mc_power_savings

Currently the sched_mc/smt_power_savings variable is a boolean,
which either enables or disables topology based power savings.
This patch extends the behaviour of the variable from boolean to
multivalued, such that based on the value, we decide how
aggressively do we want to perform powersavings balance at
appropriate sched domain based on topology.

Variable levels of power saving tunable would benefit end user to
match the required level of power savings vs performance
trade-off depending on the system configuration and workloads.

This version makes the sched_mc_power_savings global variable to
take more values (0,1,2).  Later versions can have a single
tunable called sched_power_savings instead of
sched_{mc,smt}_power_savings.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/sched.c        | 17 ++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1210fb0e45f..a96726658ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -763,6 +763,17 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		1024	/* Only a single load balancing instance */
 #define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
 
+enum powersavings_balance_level {
+	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+	POWERSAVINGS_BALANCE_BASIC,	/* Fill one thread/core/package
+					 * first for long running threads
+					 */
+	POWERSAVINGS_BALANCE_WAKEUP,	/* Also bias task wakeups to semi-idle
+					 * cpu package for power savings
+					 */
+	MAX_POWERSAVINGS_BALANCE_LEVELS
+};
+
 extern int sched_mc_power_savings, sched_smt_power_savings;
 
 static inline int sd_balance_for_mc_power(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index b309027bf9e..56b285cd535 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7906,14 +7906,25 @@ int arch_reinit_sched_domains(void)
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
 	int ret;
+	unsigned int level = 0;
 
-	if (buf[0] != '0' && buf[0] != '1')
+	if (sscanf(buf, "%u", &level) != 1)
+		return -EINVAL;
+
+	/*
+	 * level is always be positive so don't check for
+	 * level < POWERSAVINGS_BALANCE_NONE which is 0
+	 * What happens on 0 or 1 byte write,
+	 * need to check for count as well?
+	 */
+
+	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
 		return -EINVAL;
 
 	if (smt)
-		sched_smt_power_savings = (buf[0] == '1');
+		sched_smt_power_savings = level;
 	else
-		sched_mc_power_savings = (buf[0] == '1');
+		sched_mc_power_savings = level;
 
 	ret = arch_reinit_sched_domains();
 
-- 
cgit v1.2.3-70-g09d2


From 100fdaee70ebf5f31b9451fbc01300c627091328 Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Thu, 18 Dec 2008 23:26:47 +0530
Subject: sched: add SD_BALANCE_NEWIDLE at MC and CPU level for sched_mc>0

Impact: change task balancing to save power more agressively

Add SD_BALANCE_NEWIDLE flag at MC level and CPU level
if sched_mc is set.  This helps power savings and
will not affect performance when sched_mc=0

Ingo and Mike Galbraith have optimised the SD flags by
removing SD_BALANCE_NEWIDLE at MC and CPU level.  This
helps performance but hurts power savings since this
slows down task consolidation by reducing the number
of times load_balance is run.

    sched: fine-tune SD_MC_INIT
        commit 14800984706bf6936bbec5187f736e928be5c218
        Author: Mike Galbraith <efault@gmx.de>
        Date:   Fri Nov 7 15:26:50 2008 +0100

    sched: re-tune balancing -- revert
        commit 9fcd18c9e63e325dbd2b4c726623f760788d5aa8
        Author: Ingo Molnar <mingo@elte.hu>
        Date:   Wed Nov 5 16:52:08 2008 +0100

This patch selectively enables SD_BALANCE_NEWIDLE flag
only when sched_mc is set to 1 or 2.  This helps power savings
by task consolidation and also does not hurt performance at
sched_mc=0 where all power saving optimisations are turned off.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    | 13 +++++++++++++
 include/linux/topology.h |  6 ++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a96726658ec..5a933d92547 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -792,6 +792,19 @@ static inline int sd_balance_for_package_power(void)
 	return 0;
 }
 
+/*
+ * Optimise SD flags for power savings:
+ * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
+ * Keep default SD flags if sched_{smt,mc}_power_saving=0
+ */
+
+static inline int sd_power_saving_flags(void)
+{
+	if (sched_mc_power_savings | sched_smt_power_savings)
+		return SD_BALANCE_NEWIDLE;
+
+	return 0;
+}
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0ce7c0dac06..e632d29f054 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -125,7 +125,8 @@ int arch_update_cpu_topology(void);
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_BALANCE	\
 				| SD_SHARE_PKG_RESOURCES\
-				| sd_balance_for_mc_power(),\
+				| sd_balance_for_mc_power()\
+				| sd_power_saving_flags(),\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
@@ -150,7 +151,8 @@ int arch_update_cpu_topology(void);
 				| SD_BALANCE_FORK	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_BALANCE	\
-				| sd_balance_for_package_power(),\
+				| sd_balance_for_package_power()\
+				| sd_power_saving_flags(),\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 }
-- 
cgit v1.2.3-70-g09d2


From 06aaf76a7e2e4cc57eabcb8f43ec99c961fe55fe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 18 Dec 2008 21:30:23 +0100
Subject: sched: move test_sd_parent() to an SMP section of sched.h

Impact: build fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a933d92547..e5f928a079e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,6 +920,15 @@ extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 				    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
+/* Test a flag in parent sched domain */
+static inline int test_sd_parent(struct sched_domain *sd, int flag)
+{
+	if (sd->parent && (sd->parent->flags & flag))
+		return 1;
+
+	return 0;
+}
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -1431,15 +1440,6 @@ struct task_struct {
 #endif
 };
 
-/* Test a flag in parent sched domain */
-static inline int test_sd_parent(struct sched_domain *sd, int flag)
-{
-	if (sd->parent && (sd->parent->flags & flag))
-		return 1;
-
-	return 0;
-}
-
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
-- 
cgit v1.2.3-70-g09d2


From 420e7fabd9c6d907280ed6b3e40eef425c5d8d8d Mon Sep 17 00:00:00 2001
From: Henning Rogge <hrogge@googlemail.com>
Date: Thu, 11 Dec 2008 22:04:19 +0100
Subject: nl80211: Add signal strength and bandwith to nl80211station info

This patch adds signal strength and transmission bitrate
to the station_info of nl80211.

Signed-off-by: Henning Rogge <rogge@fgan.de>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 include/linux/nl80211.h | 31 ++++++++++++++++++++++++++
 include/net/cfg80211.h  | 40 ++++++++++++++++++++++++++++++++++
 net/mac80211/cfg.c      | 25 ++++++++++++++++++++-
 net/wireless/nl80211.c  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 152 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 04d4516f9c7..7501acfcfdc 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -424,6 +424,32 @@ enum nl80211_sta_flags {
 	NL80211_STA_FLAG_MAX = __NL80211_STA_FLAG_AFTER_LAST - 1
 };
 
+/**
+ * enum nl80211_rate_info - bitrate information
+ *
+ * These attribute types are used with %NL80211_STA_INFO_TXRATE
+ * when getting information about the bitrate of a station.
+ *
+ * @__NL80211_RATE_INFO_INVALID: attribute number 0 is reserved
+ * @NL80211_RATE_INFO_BITRATE: total bitrate (u16, 100kbit/s)
+ * @NL80211_RATE_INFO_MCS: mcs index for 802.11n (u8)
+ * @NL80211_RATE_INFO_40_MHZ_WIDTH: 40 Mhz dualchannel bitrate
+ * @NL80211_RATE_INFO_SHORT_GI: 400ns guard interval
+ * @NL80211_RATE_INFO_MAX: highest rate_info number currently defined
+ * @__NL80211_RATE_INFO_AFTER_LAST: internal use
+ */
+enum nl80211_rate_info {
+	__NL80211_RATE_INFO_INVALID,
+	NL80211_RATE_INFO_BITRATE,
+	NL80211_RATE_INFO_MCS,
+	NL80211_RATE_INFO_40_MHZ_WIDTH,
+	NL80211_RATE_INFO_SHORT_GI,
+
+	/* keep last */
+	__NL80211_RATE_INFO_AFTER_LAST,
+	NL80211_RATE_INFO_MAX = __NL80211_RATE_INFO_AFTER_LAST - 1
+};
+
 /**
  * enum nl80211_sta_info - station information
  *
@@ -436,6 +462,9 @@ enum nl80211_sta_flags {
  * @NL80211_STA_INFO_TX_BYTES: total transmitted bytes (u32, to this station)
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
+ * @NL80211_STA_INFO_SIGNAL: signal strength of last received PPDU (u8, dBm)
+ * @NL80211_STA_INFO_TX_BITRATE: current unicast tx rate, nested attribute
+ * 	containing info as possible, see &enum nl80211_sta_info_txrate.
  */
 enum nl80211_sta_info {
 	__NL80211_STA_INFO_INVALID,
@@ -445,6 +474,8 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_LLID,
 	NL80211_STA_INFO_PLID,
 	NL80211_STA_INFO_PLINK_STATE,
+	NL80211_STA_INFO_SIGNAL,
+	NL80211_STA_INFO_TX_BITRATE,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a0c0bf19496..65e03ac9310 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -169,6 +169,9 @@ struct station_parameters {
  * @STATION_INFO_LLID: @llid filled
  * @STATION_INFO_PLID: @plid filled
  * @STATION_INFO_PLINK_STATE: @plink_state filled
+ * @STATION_INFO_SIGNAL: @signal filled
+ * @STATION_INFO_TX_BITRATE: @tx_bitrate fields are filled
+ *  (tx_bitrate, tx_bitrate_flags and tx_bitrate_mcs)
  */
 enum station_info_flags {
 	STATION_INFO_INACTIVE_TIME	= 1<<0,
@@ -177,6 +180,39 @@ enum station_info_flags {
 	STATION_INFO_LLID		= 1<<3,
 	STATION_INFO_PLID		= 1<<4,
 	STATION_INFO_PLINK_STATE	= 1<<5,
+	STATION_INFO_SIGNAL		= 1<<6,
+	STATION_INFO_TX_BITRATE		= 1<<7,
+};
+
+/**
+ * enum station_info_rate_flags - bitrate info flags
+ *
+ * Used by the driver to indicate the specific rate transmission
+ * type for 802.11n transmissions.
+ *
+ * @RATE_INFO_FLAGS_MCS: @tx_bitrate_mcs filled
+ * @RATE_INFO_FLAGS_40_MHZ_WIDTH: 40 Mhz width transmission
+ * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval
+ */
+enum rate_info_flags {
+	RATE_INFO_FLAGS_MCS		= 1<<0,
+	RATE_INFO_FLAGS_40_MHZ_WIDTH	= 1<<1,
+	RATE_INFO_FLAGS_SHORT_GI	= 1<<2,
+};
+
+/**
+ * struct rate_info - bitrate information
+ *
+ * Information about a receiving or transmitting bitrate
+ *
+ * @flags: bitflag of flags from &enum rate_info_flags
+ * @mcs: mcs index if struct describes a 802.11n bitrate
+ * @legacy: bitrate in 100kbit/s for 802.11abg
+ */
+struct rate_info {
+	u8 flags;
+	u8 mcs;
+	u16 legacy;
 };
 
 /**
@@ -191,6 +227,8 @@ enum station_info_flags {
  * @llid: mesh local link id
  * @plid: mesh peer link id
  * @plink_state: mesh peer link state
+ * @signal: signal strength of last received packet in dBm
+ * @txrate: current unicast bitrate to this station
  */
 struct station_info {
 	u32 filled;
@@ -200,6 +238,8 @@ struct station_info {
 	u16 llid;
 	u16 plid;
 	u8 plink_state;
+	s8 signal;
+	struct rate_info txrate;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 7912eb14eca..23b5eeaf7bc 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -310,12 +310,35 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 
 	sinfo->filled = STATION_INFO_INACTIVE_TIME |
 			STATION_INFO_RX_BYTES |
-			STATION_INFO_TX_BYTES;
+			STATION_INFO_TX_BYTES |
+			STATION_INFO_TX_BITRATE;
 
 	sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
 	sinfo->rx_bytes = sta->rx_bytes;
 	sinfo->tx_bytes = sta->tx_bytes;
 
+	if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
+		sinfo->filled |= STATION_INFO_SIGNAL;
+		sinfo->signal = (s8)sta->last_signal;
+	}
+
+	sinfo->txrate.flags = 0;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)
+		sinfo->txrate.flags |= RATE_INFO_FLAGS_MCS;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+		sinfo->txrate.flags |= RATE_INFO_FLAGS_40_MHZ_WIDTH;
+	if (sta->last_tx_rate.flags & IEEE80211_TX_RC_SHORT_GI)
+		sinfo->txrate.flags |= RATE_INFO_FLAGS_SHORT_GI;
+
+	if (!(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)) {
+		struct ieee80211_supported_band *sband;
+		sband = sta->local->hw.wiphy->bands[
+				sta->local->hw.conf.channel->band];
+		sinfo->txrate.legacy =
+			sband->bitrates[sta->last_tx_rate.idx].bitrate;
+	} else
+		sinfo->txrate.mcs = sta->last_tx_rate.idx;
+
 	if (ieee80211_vif_is_mesh(&sdata->vif)) {
 #ifdef CONFIG_MAC80211_MESH
 		sinfo->filled |= STATION_INFO_LLID |
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4335f76be71..93c9b983ce0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1091,12 +1091,46 @@ static int parse_station_flags(struct nlattr *nla, u32 *staflags)
 	return 0;
 }
 
+static u16 nl80211_calculate_bitrate(struct rate_info *rate)
+{
+	int modulation, streams, bitrate;
+
+	if (!(rate->flags & RATE_INFO_FLAGS_MCS))
+		return rate->legacy;
+
+	/* the formula below does only work for MCS values smaller than 32 */
+	if (rate->mcs >= 32)
+		return 0;
+
+	modulation = rate->mcs & 7;
+	streams = (rate->mcs >> 3) + 1;
+
+	bitrate = (rate->flags & RATE_INFO_FLAGS_40_MHZ_WIDTH) ?
+			13500000 : 6500000;
+
+	if (modulation < 4)
+		bitrate *= (modulation + 1);
+	else if (modulation == 4)
+		bitrate *= (modulation + 2);
+	else
+		bitrate *= (modulation + 3);
+
+	bitrate *= streams;
+
+	if (rate->flags & RATE_INFO_FLAGS_SHORT_GI)
+		bitrate = (bitrate / 9) * 10;
+
+	/* do NOT round down here */
+	return (bitrate + 50000) / 100000;
+}
+
 static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
 				int flags, struct net_device *dev,
 				u8 *mac_addr, struct station_info *sinfo)
 {
 	void *hdr;
-	struct nlattr *sinfoattr;
+	struct nlattr *sinfoattr, *txrate;
+	u16 bitrate;
 
 	hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_STATION);
 	if (!hdr)
@@ -1126,7 +1160,29 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
 	if (sinfo->filled & STATION_INFO_PLINK_STATE)
 		NLA_PUT_U8(msg, NL80211_STA_INFO_PLINK_STATE,
 			    sinfo->plink_state);
+	if (sinfo->filled & STATION_INFO_SIGNAL)
+		NLA_PUT_U8(msg, NL80211_STA_INFO_SIGNAL,
+			   sinfo->signal);
+	if (sinfo->filled & STATION_INFO_TX_BITRATE) {
+		txrate = nla_nest_start(msg, NL80211_STA_INFO_TX_BITRATE);
+		if (!txrate)
+			goto nla_put_failure;
+
+		/* nl80211_calculate_bitrate will return 0 for mcs >= 32 */
+		bitrate = nl80211_calculate_bitrate(&sinfo->txrate);
+		if (bitrate > 0)
+			NLA_PUT_U16(msg, NL80211_RATE_INFO_BITRATE, bitrate);
 
+		if (sinfo->txrate.flags & RATE_INFO_FLAGS_MCS)
+			NLA_PUT_U8(msg, NL80211_RATE_INFO_MCS,
+				    sinfo->txrate.mcs);
+		if (sinfo->txrate.flags & RATE_INFO_FLAGS_40_MHZ_WIDTH)
+			NLA_PUT_FLAG(msg, NL80211_RATE_INFO_40_MHZ_WIDTH);
+		if (sinfo->txrate.flags & RATE_INFO_FLAGS_SHORT_GI)
+			NLA_PUT_FLAG(msg, NL80211_RATE_INFO_SHORT_GI);
+
+		nla_nest_end(msg, txrate);
+	}
 	nla_nest_end(msg, sinfoattr);
 
 	return genlmsg_end(msg, hdr);
-- 
cgit v1.2.3-70-g09d2


From 094d05dc32fc2930e381189a942016e5561775d9 Mon Sep 17 00:00:00 2001
From: Sujith <Sujith.Manoharan@atheros.com>
Date: Fri, 12 Dec 2008 11:57:43 +0530
Subject: mac80211: Fix HT channel selection

HT management is done differently for AP and STA modes, unify
to just the ->config() callback since HT is fundamentally a
PHY property and cannot be per-BSS.

Rename enum nl80211_sec_chan_offset as nl80211_channel_type to denote
the channel type ( NO_HT, HT20, HT40+, HT40- ).

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Sujith <Sujith.Manoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 drivers/net/wireless/ath9k/main.c      | 123 ++++++++-------------------------
 drivers/net/wireless/iwlwifi/iwl-agn.c |  18 +++--
 drivers/net/wireless/mac80211_hwsim.c  |   6 +-
 include/linux/nl80211.h                |  22 +++---
 include/net/cfg80211.h                 |   2 +-
 include/net/mac80211.h                 |   9 +--
 net/mac80211/cfg.c                     |   4 +-
 net/mac80211/ht.c                      |  35 +++++++---
 net/mac80211/ieee80211_i.h             |   2 +-
 net/mac80211/main.c                    |  27 +++-----
 net/mac80211/mlme.c                    |   1 +
 net/mac80211/util.c                    |   2 +-
 net/wireless/nl80211.c                 |  27 ++++----
 13 files changed, 109 insertions(+), 169 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/wireless/ath9k/main.c b/drivers/net/wireless/ath9k/main.c
index 02e1771bb27..e22fea18bad 100644
--- a/drivers/net/wireless/ath9k/main.c
+++ b/drivers/net/wireless/ath9k/main.c
@@ -623,37 +623,40 @@ static int ath_get_channel(struct ath_softc *sc,
 	return -1;
 }
 
-/* ext_chan_offset: (-1, 0, 1) (below, none, above) */
-
 static u32 ath_get_extchanmode(struct ath_softc *sc,
 			       struct ieee80211_channel *chan,
-			       int ext_chan_offset,
-			       enum ath9k_ht_macmode tx_chan_width)
+			       enum nl80211_channel_type channel_type)
 {
 	u32 chanmode = 0;
 
 	switch (chan->band) {
 	case IEEE80211_BAND_2GHZ:
-		if ((ext_chan_offset == 0) &&
-		    (tx_chan_width == ATH9K_HT_MACMODE_20))
+		switch(channel_type) {
+		case NL80211_CHAN_NO_HT:
+		case NL80211_CHAN_HT20:
 			chanmode = CHANNEL_G_HT20;
-		if ((ext_chan_offset == 1) &&
-		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
+			break;
+		case NL80211_CHAN_HT40PLUS:
 			chanmode = CHANNEL_G_HT40PLUS;
-		if ((ext_chan_offset == -1) &&
-		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
+			break;
+		case NL80211_CHAN_HT40MINUS:
 			chanmode = CHANNEL_G_HT40MINUS;
+			break;
+		}
 		break;
 	case IEEE80211_BAND_5GHZ:
-		if ((ext_chan_offset == 0) &&
-		    (tx_chan_width == ATH9K_HT_MACMODE_20))
+		switch(channel_type) {
+		case NL80211_CHAN_NO_HT:
+		case NL80211_CHAN_HT20:
 			chanmode = CHANNEL_A_HT20;
-		if ((ext_chan_offset == 1) &&
-		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
+			break;
+		case NL80211_CHAN_HT40PLUS:
 			chanmode = CHANNEL_A_HT40PLUS;
-		if ((ext_chan_offset == -1) &&
-		    (tx_chan_width == ATH9K_HT_MACMODE_2040))
+			break;
+		case NL80211_CHAN_HT40MINUS:
 			chanmode = CHANNEL_A_HT40MINUS;
+			break;
+		}
 		break;
 	default:
 		break;
@@ -829,45 +832,15 @@ static void setup_ht_cap(struct ieee80211_sta_ht_cap *ht_info)
 	ht_info->mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED;
 }
 
-static void ath9k_ht_conf(struct ath_softc *sc,
-			  struct ieee80211_bss_conf *bss_conf)
-{
-	if (sc->hw->conf.ht.enabled) {
-		if (bss_conf->ht.width_40_ok)
-			sc->tx_chan_width = ATH9K_HT_MACMODE_2040;
-		else
-			sc->tx_chan_width = ATH9K_HT_MACMODE_20;
-
-		ath9k_hw_set11nmac2040(sc->sc_ah, sc->tx_chan_width);
-
-		DPRINTF(sc, ATH_DBG_CONFIG,
-			"BSS Changed HT, chanwidth: %d\n", sc->tx_chan_width);
-	}
-}
-
-static inline int ath_sec_offset(u8 ext_offset)
-{
-	if (ext_offset == IEEE80211_HT_PARAM_CHA_SEC_NONE)
-		return 0;
-	else if (ext_offset == IEEE80211_HT_PARAM_CHA_SEC_ABOVE)
-		return 1;
-	else if (ext_offset == IEEE80211_HT_PARAM_CHA_SEC_BELOW)
-		return -1;
-
-	return 0;
-}
-
 static void ath9k_bss_assoc_info(struct ath_softc *sc,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_bss_conf *bss_conf)
 {
-	struct ieee80211_hw *hw = sc->hw;
-	struct ieee80211_channel *curchan = hw->conf.channel;
 	struct ath_vap *avp = (void *)vif->drv_priv;
-	int pos;
 
 	if (bss_conf->assoc) {
-		DPRINTF(sc, ATH_DBG_CONFIG, "Bss Info ASSOC %d\n", bss_conf->aid);
+		DPRINTF(sc, ATH_DBG_CONFIG, "Bss Info ASSOC %d, bssid: %pM\n",
+			bss_conf->aid, sc->sc_curbssid);
 
 		/* New association, store aid */
 		if (avp->av_opmode == NL80211_IFTYPE_STATION) {
@@ -886,40 +859,6 @@ static void ath9k_bss_assoc_info(struct ath_softc *sc,
 		sc->sc_halstats.ns_avgtxrssi = ATH_RSSI_DUMMY_MARKER;
 		sc->sc_halstats.ns_avgtxrate = ATH_RATE_DUMMY_MARKER;
 
-		/* Update chainmask */
-		ath_update_chainmask(sc, hw->conf.ht.enabled);
-
-		DPRINTF(sc, ATH_DBG_CONFIG,
-			"bssid %pM aid 0x%x\n",
-			sc->sc_curbssid, sc->sc_curaid);
-
-		pos = ath_get_channel(sc, curchan);
-		if (pos == -1) {
-			DPRINTF(sc, ATH_DBG_FATAL,
-				"Invalid channel: %d\n", curchan->center_freq);
-			return;
-		}
-
-		if (hw->conf.ht.enabled) {
-			int offset =
-				ath_sec_offset(bss_conf->ht.secondary_channel_offset);
-			sc->tx_chan_width = (bss_conf->ht.width_40_ok) ?
-				ATH9K_HT_MACMODE_2040 : ATH9K_HT_MACMODE_20;
-
-			sc->sc_ah->ah_channels[pos].chanmode =
-				ath_get_extchanmode(sc, curchan,
-						    offset, sc->tx_chan_width);
-		} else {
-			sc->sc_ah->ah_channels[pos].chanmode =
-				(curchan->band == IEEE80211_BAND_2GHZ) ?
-				CHANNEL_G : CHANNEL_A;
-		}
-
-		/* set h/w channel */
-		if (ath_set_channel(sc, &sc->sc_ah->ah_channels[pos]) < 0)
-			DPRINTF(sc, ATH_DBG_FATAL, "Unable to set channel: %d\n",
-				curchan->center_freq);
-
 		/* Start ANI */
 		mod_timer(&sc->sc_ani.timer,
 			jiffies + msecs_to_jiffies(ATH_ANI_POLLINTERVAL));
@@ -2146,7 +2085,8 @@ static int ath9k_config(struct ieee80211_hw *hw, u32 changed)
 	struct ath_softc *sc = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
 
-	if (changed & IEEE80211_CONF_CHANGE_CHANNEL) {
+	if (changed & (IEEE80211_CONF_CHANGE_CHANNEL |
+		       IEEE80211_CONF_CHANGE_HT)) {
 		struct ieee80211_channel *curchan = hw->conf.channel;
 		int pos;
 
@@ -2165,25 +2105,23 @@ static int ath9k_config(struct ieee80211_hw *hw, u32 changed)
 			(curchan->band == IEEE80211_BAND_2GHZ) ?
 			CHANNEL_G : CHANNEL_A;
 
-		if ((sc->sc_ah->ah_opmode == NL80211_IFTYPE_AP) &&
-		    (conf->ht.enabled)) {
-			sc->tx_chan_width = (!!conf->ht.sec_chan_offset) ?
-				ATH9K_HT_MACMODE_2040 : ATH9K_HT_MACMODE_20;
+		if (conf->ht.enabled) {
+			if (conf->ht.channel_type == NL80211_CHAN_HT40PLUS ||
+			    conf->ht.channel_type == NL80211_CHAN_HT40MINUS)
+				sc->tx_chan_width = ATH9K_HT_MACMODE_2040;
 
 			sc->sc_ah->ah_channels[pos].chanmode =
 				ath_get_extchanmode(sc, curchan,
-						    conf->ht.sec_chan_offset,
-						    sc->tx_chan_width);
+						    conf->ht.channel_type);
 		}
 
 		if (ath_set_channel(sc, &sc->sc_ah->ah_channels[pos]) < 0) {
 			DPRINTF(sc, ATH_DBG_FATAL, "Unable to set channel\n");
 			return -EINVAL;
 		}
-	}
 
-	if (changed & IEEE80211_CONF_CHANGE_HT)
 		ath_update_chainmask(sc, conf->ht.enabled);
+	}
 
 	if (changed & IEEE80211_CONF_CHANGE_POWER)
 		sc->sc_config.txpowlimit = 2 * conf->power_level;
@@ -2417,9 +2355,6 @@ static void ath9k_bss_info_changed(struct ieee80211_hw *hw,
 			sc->sc_flags &= ~SC_OP_PROTECT_ENABLE;
 	}
 
-	if (changed & BSS_CHANGED_HT)
-		ath9k_ht_conf(sc, bss_conf);
-
 	if (changed & BSS_CHANGED_ASSOC) {
 		DPRINTF(sc, ATH_DBG_CONFIG, "BSS Changed ASSOC %d\n",
 			bss_conf->assoc);
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index 2f5e86e1291..bbc1c8052ff 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -515,19 +515,27 @@ static void iwl_ht_conf(struct iwl_priv *priv,
 	iwl_conf->supported_chan_width =
 		!!(ht_conf->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40);
 
-	iwl_conf->extension_chan_offset = bss_conf->ht.secondary_channel_offset;
+	/*
+	 * XXX: The HT configuration needs to be moved into iwl_mac_config()
+	 *	to be done there correctly.
+	 */
+
+	iwl_conf->extension_chan_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
+	if (priv->hw->conf.ht.channel_type == NL80211_CHAN_HT40MINUS)
+		iwl_conf->extension_chan_offset = IEEE80211_HT_PARAM_CHA_SEC_BELOW;
+	else if(priv->hw->conf.ht.channel_type == NL80211_CHAN_HT40PLUS)
+		iwl_conf->extension_chan_offset = IEEE80211_HT_PARAM_CHA_SEC_ABOVE;
+
 	/* If no above or below channel supplied disable FAT channel */
 	if (iwl_conf->extension_chan_offset != IEEE80211_HT_PARAM_CHA_SEC_ABOVE &&
-	    iwl_conf->extension_chan_offset != IEEE80211_HT_PARAM_CHA_SEC_BELOW) {
-		iwl_conf->extension_chan_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
+	    iwl_conf->extension_chan_offset != IEEE80211_HT_PARAM_CHA_SEC_BELOW)
 		iwl_conf->supported_chan_width = 0;
-	}
 
 	iwl_conf->sm_ps = (u8)((ht_conf->cap & IEEE80211_HT_CAP_SM_PS) >> 2);
 
 	memcpy(&iwl_conf->mcs, &ht_conf->mcs, 16);
 
-	iwl_conf->tx_chan_width = bss_conf->ht.width_40_ok;
+	iwl_conf->tx_chan_width = iwl_conf->supported_chan_width != 0;
 	iwl_conf->ht_protection =
 		bss_conf->ht.operation_mode & IEEE80211_HT_OP_MODE_PROTECTION;
 	iwl_conf->non_GF_STA_present =
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index fd5a537ac51..f83d69e813d 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -495,11 +495,9 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw,
 	}
 
 	if (changed & BSS_CHANGED_HT) {
-		printk(KERN_DEBUG "  %s: HT: sec_ch_offs=%d width_40_ok=%d "
-		       "op_mode=%d\n",
+		printk(KERN_DEBUG "  %s: HT: op_mode=0x%x\n",
 		       wiphy_name(hw->wiphy),
-		       info->ht.secondary_channel_offset,
-		       info->ht.width_40_ok, info->ht.operation_mode);
+		       info->ht.operation_mode);
 	}
 
 	if (changed & BSS_CHANGED_BASIC_RATES) {
diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h
index 7501acfcfdc..e86ed59f9ad 100644
--- a/include/linux/nl80211.h
+++ b/include/linux/nl80211.h
@@ -201,13 +201,13 @@ enum nl80211_commands {
  * @NL80211_ATTR_WIPHY_NAME: wiphy name (used for renaming)
  * @NL80211_ATTR_WIPHY_TXQ_PARAMS: a nested array of TX queue parameters
  * @NL80211_ATTR_WIPHY_FREQ: frequency of the selected channel in MHz
- * @NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET: included with NL80211_ATTR_WIPHY_FREQ
+ * @NL80211_ATTR_WIPHY_CHANNEL_TYPE: included with NL80211_ATTR_WIPHY_FREQ
  *	if HT20 or HT40 are allowed (i.e., 802.11n disabled if not included):
- *	NL80211_SEC_CHAN_NO_HT = HT not allowed (i.e., same as not including
+ *	NL80211_CHAN_NO_HT = HT not allowed (i.e., same as not including
  *		this attribute)
- *	NL80211_SEC_CHAN_DISABLED = HT20 only
- *	NL80211_SEC_CHAN_BELOW = secondary channel is below the primary channel
- *	NL80211_SEC_CHAN_ABOVE = secondary channel is above the primary channel
+ *	NL80211_CHAN_HT20 = HT20 only
+ *	NL80211_CHAN_HT40MINUS = secondary channel is below the primary channel
+ *	NL80211_CHAN_HT40PLUS = secondary channel is above the primary channel
  *
  * @NL80211_ATTR_IFINDEX: network interface index of the device to operate on
  * @NL80211_ATTR_IFNAME: network interface name
@@ -344,7 +344,7 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_WIPHY_TXQ_PARAMS,
 	NL80211_ATTR_WIPHY_FREQ,
-	NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET,
+	NL80211_ATTR_WIPHY_CHANNEL_TYPE,
 
 	/* add attributes here, update the policy in nl80211.c */
 
@@ -805,10 +805,10 @@ enum nl80211_txq_q {
 	NL80211_TXQ_Q_BK
 };
 
-enum nl80211_sec_chan_offset {
-	NL80211_SEC_CHAN_NO_HT /* No HT */,
-	NL80211_SEC_CHAN_DISABLED /* HT20 only */,
-	NL80211_SEC_CHAN_BELOW /* HT40- */,
-	NL80211_SEC_CHAN_ABOVE /* HT40+ */
+enum nl80211_channel_type {
+	NL80211_CHAN_NO_HT,
+	NL80211_CHAN_HT20,
+	NL80211_CHAN_HT40MINUS,
+	NL80211_CHAN_HT40PLUS
 };
 #endif /* __LINUX_NL80211_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 65e03ac9310..23c0ab74ded 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -563,7 +563,7 @@ struct cfg80211_ops {
 
 	int	(*set_channel)(struct wiphy *wiphy,
 			       struct ieee80211_channel *chan,
-			       enum nl80211_sec_chan_offset);
+			       enum nl80211_channel_type channel_type);
 };
 
 /* temporary wext handlers */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 046ce692a90..22ae72c58d7 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -165,14 +165,9 @@ enum ieee80211_bss_change {
 
 /**
  * struct ieee80211_bss_ht_conf - BSS's changing HT configuration
- * @secondary_channel_offset: secondary channel offset, uses
- *	%IEEE80211_HT_PARAM_CHA_SEC_ values
- * @width_40_ok: indicates that 40 MHz bandwidth may be used for TX
  * @operation_mode: HT operation mode (like in &struct ieee80211_ht_info)
  */
 struct ieee80211_bss_ht_conf {
-	u8 secondary_channel_offset;
-	bool width_40_ok;
 	u16 operation_mode;
 };
 
@@ -508,9 +503,7 @@ static inline int __deprecated __IEEE80211_CONF_SHORT_SLOT_TIME(void)
 
 struct ieee80211_ht_conf {
 	bool enabled;
-	int sec_chan_offset; /* 0 = HT40 disabled; -1 = HT40 enabled, secondary
-			      * channel below primary; 1 = HT40 enabled,
-			      * secondary channel above primary */
+	enum nl80211_channel_type channel_type;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 23b5eeaf7bc..3ea0884c943 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1122,12 +1122,12 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 
 static int ieee80211_set_channel(struct wiphy *wiphy,
 				 struct ieee80211_channel *chan,
-				 enum nl80211_sec_chan_offset sec_chan_offset)
+				 enum nl80211_channel_type channel_type)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 
 	local->oper_channel = chan;
-	local->oper_sec_chan_offset = sec_chan_offset;
+	local->oper_channel_type = channel_type;
 
 	return ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
 }
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index a1eed7032c9..5f510a13b9f 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -98,6 +98,7 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_bss_ht_conf ht;
 	u32 changed = 0;
 	bool enable_ht = true, ht_changed;
+	enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
@@ -112,24 +113,36 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
 	    ieee80211_channel_to_frequency(hti->control_chan))
 		enable_ht = false;
 
-	/*
-	 * XXX: This is totally incorrect when there are multiple virtual
-	 *	interfaces, needs to be fixed later.
-	 */
-	ht_changed = local->hw.conf.ht.enabled != enable_ht;
+	if (enable_ht) {
+		channel_type = NL80211_CHAN_HT20;
+
+		if (!(ap_ht_cap_flags & IEEE80211_HT_CAP_40MHZ_INTOLERANT) &&
+		    (sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) &&
+		    (hti->ht_param & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY)) {
+			switch(hti->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
+			case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
+				channel_type = NL80211_CHAN_HT40PLUS;
+				break;
+			case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
+				channel_type = NL80211_CHAN_HT40MINUS;
+				break;
+			}
+		}
+	}
+
+	ht_changed = local->hw.conf.ht.enabled != enable_ht ||
+		     channel_type != local->hw.conf.ht.channel_type;
+
+	local->oper_channel_type = channel_type;
 	local->hw.conf.ht.enabled = enable_ht;
+
 	if (ht_changed)
 		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_HT);
 
 	/* disable HT */
 	if (!enable_ht)
 		return 0;
-	ht.secondary_channel_offset =
-		hti->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET;
-	ht.width_40_ok =
-		!(ap_ht_cap_flags & IEEE80211_HT_CAP_40MHZ_INTOLERANT) &&
-		(sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) &&
-		(hti->ht_param & IEEE80211_HT_PARAM_CHAN_WIDTH_ANY);
+
 	ht.operation_mode = le16_to_cpu(hti->operation_mode);
 
 	/* if bss configuration changed store the new one */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6f59e11d7b3..a7dabaecfc7 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -625,7 +625,7 @@ struct ieee80211_local {
 	struct delayed_work scan_work;
 	struct ieee80211_sub_if_data *scan_sdata;
 	struct ieee80211_channel *oper_channel, *scan_channel;
-	enum nl80211_sec_chan_offset oper_sec_chan_offset;
+	enum nl80211_channel_type oper_channel_type;
 	u8 scan_ssid[IEEE80211_MAX_SSID_LEN];
 	size_t scan_ssid_len;
 	struct list_head bss_list;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 6d8710327d1..a0371caf01c 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -195,37 +195,30 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 	struct ieee80211_channel *chan;
 	int ret = 0;
 	int power;
-	enum nl80211_sec_chan_offset sec_chan_offset;
+	enum nl80211_channel_type channel_type;
 
 	might_sleep();
 
 	if (local->sw_scanning) {
 		chan = local->scan_channel;
-		sec_chan_offset = NL80211_SEC_CHAN_NO_HT;
+		channel_type = NL80211_CHAN_NO_HT;
 	} else {
 		chan = local->oper_channel;
-		sec_chan_offset = local->oper_sec_chan_offset;
+		channel_type = local->oper_channel_type;
 	}
 
 	if (chan != local->hw.conf.channel ||
-	    sec_chan_offset != local->hw.conf.ht.sec_chan_offset) {
+	    channel_type != local->hw.conf.ht.channel_type) {
 		local->hw.conf.channel = chan;
-		switch (sec_chan_offset) {
-		case NL80211_SEC_CHAN_NO_HT:
+		local->hw.conf.ht.channel_type = channel_type;
+		switch (channel_type) {
+		case NL80211_CHAN_NO_HT:
 			local->hw.conf.ht.enabled = false;
-			local->hw.conf.ht.sec_chan_offset = 0;
 			break;
-		case NL80211_SEC_CHAN_DISABLED:
+		case NL80211_CHAN_HT20:
+		case NL80211_CHAN_HT40MINUS:
+		case NL80211_CHAN_HT40PLUS:
 			local->hw.conf.ht.enabled = true;
-			local->hw.conf.ht.sec_chan_offset = 0;
-			break;
-		case NL80211_SEC_CHAN_BELOW:
-			local->hw.conf.ht.enabled = true;
-			local->hw.conf.ht.sec_chan_offset = -1;
-			break;
-		case NL80211_SEC_CHAN_ABOVE:
-			local->hw.conf.ht.enabled = true;
-			local->hw.conf.ht.sec_chan_offset = 1;
 			break;
 		}
 		changed |= IEEE80211_CONF_CHANGE_CHANNEL;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 290b0017ef2..e4d1fca5c72 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -858,6 +858,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	rcu_read_unlock();
 
 	local->hw.conf.ht.enabled = false;
+	local->oper_channel_type = NL80211_CHAN_NO_HT;
 	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_HT);
 
 	ieee80211_bss_info_change_notify(sdata, changed);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 505d68f344c..71a8391c54f 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -641,7 +641,7 @@ int ieee80211_set_freq(struct ieee80211_sub_if_data *sdata, int freqMHz)
 		    chan->flags & IEEE80211_CHAN_NO_IBSS)
 			return ret;
 		local->oper_channel = chan;
-		local->oper_sec_chan_offset = NL80211_SEC_CHAN_NO_HT;
+		local->oper_channel_type = NL80211_CHAN_NO_HT;
 
 		if (local->sw_scanning || local->hw_scanning)
 			ret = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 93c9b983ce0..1e728fff474 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -60,7 +60,7 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = {
 				      .len = BUS_ID_SIZE-1 },
 	[NL80211_ATTR_WIPHY_TXQ_PARAMS] = { .type = NLA_NESTED },
 	[NL80211_ATTR_WIPHY_FREQ] = { .type = NLA_U32 },
-	[NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET] = { .type = NLA_U32 },
+	[NL80211_ATTR_WIPHY_CHANNEL_TYPE] = { .type = NLA_U32 },
 
 	[NL80211_ATTR_IFTYPE] = { .type = NLA_U32 },
 	[NL80211_ATTR_IFINDEX] = { .type = NLA_U32 },
@@ -362,8 +362,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) {
-		enum nl80211_sec_chan_offset sec_chan_offset =
-			NL80211_SEC_CHAN_NO_HT;
+		enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
 		struct ieee80211_channel *chan;
 		struct ieee80211_sta_ht_cap *ht_cap;
 		u32 freq, sec_freq;
@@ -375,13 +374,13 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 
 		result = -EINVAL;
 
-		if (info->attrs[NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET]) {
-			sec_chan_offset = nla_get_u32(info->attrs[
-					NL80211_ATTR_WIPHY_SEC_CHAN_OFFSET]);
-			if (sec_chan_offset != NL80211_SEC_CHAN_NO_HT &&
-			    sec_chan_offset != NL80211_SEC_CHAN_DISABLED &&
-			    sec_chan_offset != NL80211_SEC_CHAN_BELOW &&
-			    sec_chan_offset != NL80211_SEC_CHAN_ABOVE)
+		if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) {
+			channel_type = nla_get_u32(info->attrs[
+					   NL80211_ATTR_WIPHY_CHANNEL_TYPE]);
+			if (channel_type != NL80211_CHAN_NO_HT &&
+			    channel_type != NL80211_CHAN_HT20 &&
+			    channel_type != NL80211_CHAN_HT40PLUS &&
+			    channel_type != NL80211_CHAN_HT40MINUS)
 				goto bad_res;
 		}
 
@@ -392,9 +391,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		if (!chan || chan->flags & IEEE80211_CHAN_DISABLED)
 			goto bad_res;
 
-		if (sec_chan_offset == NL80211_SEC_CHAN_BELOW)
+		if (channel_type == NL80211_CHAN_HT40MINUS)
 			sec_freq = freq - 20;
-		else if (sec_chan_offset == NL80211_SEC_CHAN_ABOVE)
+		else if (channel_type == NL80211_CHAN_HT40PLUS)
 			sec_freq = freq + 20;
 		else
 			sec_freq = 0;
@@ -402,7 +401,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		ht_cap = &rdev->wiphy.bands[chan->band]->ht_cap;
 
 		/* no HT capabilities */
-		if (sec_chan_offset != NL80211_SEC_CHAN_NO_HT &&
+		if (channel_type != NL80211_CHAN_NO_HT &&
 		    !ht_cap->ht_supported)
 			goto bad_res;
 
@@ -422,7 +421,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		}
 
 		result = rdev->ops->set_channel(&rdev->wiphy, chan,
-						sec_chan_offset);
+						channel_type);
 		if (result)
 			goto bad_res;
 	}
-- 
cgit v1.2.3-70-g09d2


From 12204e24b1330428c3062faee10a0d80b8a5cb61 Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 19 Dec 2008 10:44:42 +1100
Subject: security: pass mount flags to security_sb_kern_mount()

Pass mount flags to security_sb_kern_mount(), so security modules
can determine if a mount operation is being performed by the kernel.

Signed-off-by: James Morris <jmorris@namei.org>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
---
 fs/super.c                 | 2 +-
 include/linux/security.h   | 6 +++---
 security/capability.c      | 2 +-
 security/security.c        | 4 ++--
 security/selinux/hooks.c   | 2 +-
 security/smack/smack_lsm.c | 3 ++-
 6 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index 400a7608f15..ddba069d7a9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -914,7 +914,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
 
- 	error = security_sb_kern_mount(mnt->mnt_sb, secdata);
+ 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
  	if (error)
  		goto out_sb;
 
diff --git a/include/linux/security.h b/include/linux/security.h
index 6423abf1ac0..3416cb85e77 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1308,7 +1308,7 @@ struct security_operations {
 	int (*sb_alloc_security) (struct super_block *sb);
 	void (*sb_free_security) (struct super_block *sb);
 	int (*sb_copy_data) (char *orig, char *copy);
-	int (*sb_kern_mount) (struct super_block *sb, void *data);
+	int (*sb_kern_mount) (struct super_block *sb, int flags, void *data);
 	int (*sb_show_options) (struct seq_file *m, struct super_block *sb);
 	int (*sb_statfs) (struct dentry *dentry);
 	int (*sb_mount) (char *dev_name, struct path *path,
@@ -1575,7 +1575,7 @@ int security_bprm_secureexec(struct linux_binprm *bprm);
 int security_sb_alloc(struct super_block *sb);
 void security_sb_free(struct super_block *sb);
 int security_sb_copy_data(char *orig, char *copy);
-int security_sb_kern_mount(struct super_block *sb, void *data);
+int security_sb_kern_mount(struct super_block *sb, int flags, void *data);
 int security_sb_show_options(struct seq_file *m, struct super_block *sb);
 int security_sb_statfs(struct dentry *dentry);
 int security_sb_mount(char *dev_name, struct path *path,
@@ -1850,7 +1850,7 @@ static inline int security_sb_copy_data(char *orig, char *copy)
 	return 0;
 }
 
-static inline int security_sb_kern_mount(struct super_block *sb, void *data)
+static inline int security_sb_kern_mount(struct super_block *sb, int flags, void *data)
 {
 	return 0;
 }
diff --git a/security/capability.c b/security/capability.c
index b9e391425e6..2dce66fcb99 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -59,7 +59,7 @@ static int cap_sb_copy_data(char *orig, char *copy)
 	return 0;
 }
 
-static int cap_sb_kern_mount(struct super_block *sb, void *data)
+static int cap_sb_kern_mount(struct super_block *sb, int flags, void *data)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index f0d96a6cc4e..d85dbb37c97 100644
--- a/security/security.c
+++ b/security/security.c
@@ -254,9 +254,9 @@ int security_sb_copy_data(char *orig, char *copy)
 }
 EXPORT_SYMBOL(security_sb_copy_data);
 
-int security_sb_kern_mount(struct super_block *sb, void *data)
+int security_sb_kern_mount(struct super_block *sb, int flags, void *data)
 {
-	return security_ops->sb_kern_mount(sb, data);
+	return security_ops->sb_kern_mount(sb, flags, data);
 }
 
 int security_sb_show_options(struct seq_file *m, struct super_block *sb)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 8dbc54cde59..7465d713b53 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2474,7 +2474,7 @@ out:
 	return rc;
 }
 
-static int selinux_sb_kern_mount(struct super_block *sb, void *data)
+static int selinux_sb_kern_mount(struct super_block *sb, int flags, void *data)
 {
 	const struct cred *cred = current_cred();
 	struct avc_audit_data ad;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 8ad48161cef..1b5551dfc1f 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -250,11 +250,12 @@ static int smack_sb_copy_data(char *orig, char *smackopts)
 /**
  * smack_sb_kern_mount - Smack specific mount processing
  * @sb: the file system superblock
+ * @flags: the mount flags
  * @data: the smack mount options
  *
  * Returns 0 on success, an error code on failure
  */
-static int smack_sb_kern_mount(struct super_block *sb, void *data)
+static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 {
 	struct dentry *root = sb->s_root;
 	struct inode *inode = root->d_inode;
-- 
cgit v1.2.3-70-g09d2


From 6bd9cd50c830eb88d571c492ec370a30bf999e15 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Fri, 19 Dec 2008 13:47:26 -0800
Subject: x86: PAT: clarify is_linear_pfn_mapping() interface

Impact: Documentation only

Incremental patches to address the review comments from Nick Piggin
for v3 version of x86 PAT pfnmap changes patchset here

http://lkml.indiana.edu/hypermail/linux/kernel/0812.2/01330.html

This patch:

Clarify is_linear_pfn_mapping() and its usage.

It is used by x86 PAT code for performance reasons. Identifying pfnmap
as linear over entire vma helps speedup reserve and free of memtype
for the region.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/linux/mm.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 87ecb40e11a..35f811b0cd6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -145,6 +145,14 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 
+/*
+ * This interface is used by x86 PAT code to identify a pfn mapping that is
+ * linear over entire vma. This is to optimize PAT code that deals with
+ * marking the physical region with a particular prot. This is not for generic
+ * mm use. Note also that this check will not work if the pfn mapping is
+ * linear for a vma starting at physical address 0. In which case PAT code
+ * falls back to slow path of reserving physical range page by page.
+ */
 static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
 {
 	return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff);
-- 
cgit v1.2.3-70-g09d2


From d87fe6607c31944f7572f965c1507ae77026c133 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Fri, 19 Dec 2008 13:47:27 -0800
Subject: x86: PAT: modify follow_phys to return phys_addr prot and return
 value

Impact: Changes and globalizes an existing static interface.

Follow_phys does similar things as follow_pfnmap_pte. Make a minor change
to follow_phys so that it can be used in place of follow_pfnmap_pte.
Physical address return value with 0 as error return does not work in
follow_phys as the actual physical address 0 mapping may exist in pte.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 31 ++++++++++++++-----------------
 2 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 35f811b0cd6..2f6e2f886d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -804,6 +804,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
+int follow_phys(struct vm_area_struct *vma, unsigned long address,
+		unsigned int flags, unsigned long *prot, resource_size_t *phys);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
diff --git a/mm/memory.c b/mm/memory.c
index 1e8f0d347c0..79f28e35d4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2981,9 +2981,9 @@ int in_gate_area_no_task(unsigned long addr)
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
 #ifdef CONFIG_HAVE_IOREMAP_PROT
-static resource_size_t follow_phys(struct vm_area_struct *vma,
-			unsigned long address, unsigned int flags,
-			unsigned long *prot)
+int follow_phys(struct vm_area_struct *vma,
+		unsigned long address, unsigned int flags,
+		unsigned long *prot, resource_size_t *phys)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -2992,24 +2992,26 @@ static resource_size_t follow_phys(struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	resource_size_t phys_addr = 0;
 	struct mm_struct *mm = vma->vm_mm;
+	int ret = -EINVAL;
 
-	VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;
 
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto no_page_table;
+		goto out;
 
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto no_page_table;
+		goto out;
 
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto no_page_table;
+		goto out;
 
 	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
 	if (pmd_huge(*pmd))
-		goto no_page_table;
+		goto out;
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
@@ -3024,13 +3026,13 @@ static resource_size_t follow_phys(struct vm_area_struct *vma,
 	phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
 
 	*prot = pgprot_val(pte_pgprot(pte));
+	*phys = phys_addr;
+	ret = 0;
 
 unlock:
 	pte_unmap_unlock(ptep, ptl);
 out:
-	return phys_addr;
-no_page_table:
-	return 0;
+	return ret;
 }
 
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
@@ -3041,12 +3043,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 	void *maddr;
 	int offset = addr & (PAGE_SIZE-1);
 
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return -EINVAL;
-
-	phys_addr = follow_phys(vma, addr, write, &prot);
-
-	if (!phys_addr)
+	if (follow_phys(vma, addr, write, &prot, &phys_addr))
 		return -EINVAL;
 
 	maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
-- 
cgit v1.2.3-70-g09d2


From 982d789ab76c8a11426852fec2fdf2f412e21c0c Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Fri, 19 Dec 2008 13:47:28 -0800
Subject: x86: PAT: remove follow_pfnmap_pte in favor of follow_phys

Impact: Cleanup - removes a new function in favor of a recently modified older one.

Replace follow_pfnmap_pte in pat code with follow_phys. follow_phys lso
returns protection eliminating the need of pte_pgprot call. Using follow_phys
also eliminates the need for pte_pa.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/pgtable.h |  5 -----
 arch/x86/mm/pat.c              | 30 +++++++++++------------------
 include/linux/mm.h             |  3 ---
 mm/memory.c                    | 43 ------------------------------------------
 4 files changed, 11 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 579f8ceee94..2aa792bbd7e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -230,11 +230,6 @@ static inline unsigned long pte_pfn(pte_t pte)
 	return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
 }
 
-static inline u64 pte_pa(pte_t pte)
-{
-	return pte_val(pte) & PTE_PFN_MASK;
-}
-
 #define pte_page(pte)	pfn_to_page(pte_pfn(pte))
 
 static inline int pmd_large(pmd_t pte)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index d5254bae84f..541bcc944a5 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -685,8 +685,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
 	int retval = 0;
 	unsigned long i, j;
 	u64 paddr;
-	pgprot_t prot;
-	pte_t pte;
+	unsigned long prot;
 	unsigned long vma_start = vma->vm_start;
 	unsigned long vma_end = vma->vm_end;
 	unsigned long vma_size = vma_end - vma_start;
@@ -696,26 +695,22 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
 
 	if (is_linear_pfn_mapping(vma)) {
 		/*
-		 * reserve the whole chunk starting from vm_pgoff,
-		 * But, we have to get the protection from pte.
+		 * reserve the whole chunk covered by vma. We need the
+		 * starting address and protection from pte.
 		 */
-		if (follow_pfnmap_pte(vma, vma_start, &pte)) {
+		if (follow_phys(vma, vma_start, 0, &prot, &paddr)) {
 			WARN_ON_ONCE(1);
-			return -1;
+			return -EINVAL;
 		}
-		prot = pte_pgprot(pte);
-		paddr = (u64)vma->vm_pgoff << PAGE_SHIFT;
-		return reserve_pfn_range(paddr, vma_size, prot);
+		return reserve_pfn_range(paddr, vma_size, __pgprot(prot));
 	}
 
 	/* reserve entire vma page by page, using pfn and prot from pte */
 	for (i = 0; i < vma_size; i += PAGE_SIZE) {
-		if (follow_pfnmap_pte(vma, vma_start + i, &pte))
+		if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
 			continue;
 
-		paddr = pte_pa(pte);
-		prot = pte_pgprot(pte);
-		retval = reserve_pfn_range(paddr, PAGE_SIZE, prot);
+		retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot));
 		if (retval)
 			goto cleanup_ret;
 	}
@@ -724,10 +719,9 @@ int track_pfn_vma_copy(struct vm_area_struct *vma)
 cleanup_ret:
 	/* Reserve error: Cleanup partial reservation and return error */
 	for (j = 0; j < i; j += PAGE_SIZE) {
-		if (follow_pfnmap_pte(vma, vma_start + j, &pte))
+		if (follow_phys(vma, vma_start + j, 0, &prot, &paddr))
 			continue;
 
-		paddr = pte_pa(pte);
 		free_pfn_range(paddr, PAGE_SIZE);
 	}
 
@@ -797,6 +791,7 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
 {
 	unsigned long i;
 	u64 paddr;
+	unsigned long prot;
 	unsigned long vma_start = vma->vm_start;
 	unsigned long vma_end = vma->vm_end;
 	unsigned long vma_size = vma_end - vma_start;
@@ -821,12 +816,9 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
 	} else {
 		/* free entire vma, page by page, using the pfn from pte */
 		for (i = 0; i < vma_size; i += PAGE_SIZE) {
-			pte_t pte;
-
-			if (follow_pfnmap_pte(vma, vma_start + i, &pte))
+			if (follow_phys(vma, vma_start + i, 0, &prot, &paddr))
 				continue;
 
-			paddr = pte_pa(pte);
 			free_pfn_range(paddr, PAGE_SIZE);
 		}
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2f6e2f886d4..36f9b3fa5e1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1239,9 +1239,6 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
-int follow_pfnmap_pte(struct vm_area_struct *vma,
-				unsigned long address, pte_t *ret_ptep);
-
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 79f28e35d4f..6b29f39a5a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1168,49 +1168,6 @@ no_page_table:
 	return page;
 }
 
-int follow_pfnmap_pte(struct vm_area_struct *vma, unsigned long address,
-			pte_t *ret_ptep)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
-	struct page *page;
-	struct mm_struct *mm = vma->vm_mm;
-
-	if (!is_pfn_mapping(vma))
-		goto err;
-
-	page = NULL;
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto err;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto err;
-
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		goto err;
-
-	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	pte = *ptep;
-	if (!pte_present(pte))
-		goto err_unlock;
-
-	*ret_ptep = pte;
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-
-err_unlock:
-	pte_unmap_unlock(ptep, ptl);
-err:
-	return -EINVAL;
-}
-
 /* Can we do the FOLL_ANON optimization? */
 static inline int use_zero_page(struct vm_area_struct *vma)
 {
-- 
cgit v1.2.3-70-g09d2


From 34801ba9bf0381fcf0e2b08179d2c07f2c6ede74 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Fri, 19 Dec 2008 13:47:29 -0800
Subject: x86: PAT: move track untrack pfnmap stubs to asm-generic

Impact: Cleanup and branch hints only.

Move the track and untrack pfn stub routines from memory.c to asm-generic.
Also add unlikely to pfnmap related calls in fork and exit path.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/pgtable.h |  6 ++----
 include/asm-generic/pgtable.h  | 46 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h             |  6 ------
 mm/memory.c                    | 48 ++----------------------------------------
 4 files changed, 50 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2aa792bbd7e..875192bf72c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -339,12 +339,10 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 
 #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
 
+#ifndef __ASSEMBLY__
 /* Indicate that x86 has its own track and untrack pfn vma functions */
-#define track_pfn_vma_new track_pfn_vma_new
-#define track_pfn_vma_copy track_pfn_vma_copy
-#define untrack_pfn_vma untrack_pfn_vma
+#define __HAVE_PFNMAP_TRACKING
 
-#ifndef __ASSEMBLY__
 #define __HAVE_PHYS_MEM_ACCESS_PROT
 struct file;
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b84633801fb..72ebe91005a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -293,6 +293,52 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #define arch_flush_lazy_cpu_mode()	do {} while (0)
 #endif
 
+#ifndef __HAVE_PFNMAP_TRACKING
+/*
+ * Interface that can be used by architecture code to keep track of
+ * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
+ *
+ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
+ * for physical range indicated by pfn and size.
+ */
+static inline int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
+					unsigned long pfn, unsigned long size)
+{
+	return 0;
+}
+
+/*
+ * Interface that can be used by architecture code to keep track of
+ * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
+ *
+ * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ */
+static inline int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+	return 0;
+}
+
+/*
+ * Interface that can be used by architecture code to keep track of
+ * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
+ *
+ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case size can be zero).
+ */
+static inline void untrack_pfn_vma(struct vm_area_struct *vma,
+					unsigned long pfn, unsigned long size)
+{
+}
+#else
+extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
+				unsigned long pfn, unsigned long size);
+extern int track_pfn_vma_copy(struct vm_area_struct *vma);
+extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+				unsigned long size);
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_GENERIC_PGTABLE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 36f9b3fa5e1..d3ddd735e37 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -163,12 +163,6 @@ static inline int is_pfn_mapping(struct vm_area_struct *vma)
 	return (vma->vm_flags & VM_PFNMAP);
 }
 
-extern int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
-				unsigned long pfn, unsigned long size);
-extern int track_pfn_vma_copy(struct vm_area_struct *vma);
-extern void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
-				unsigned long size);
-
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
diff --git a/mm/memory.c b/mm/memory.c
index 6b29f39a5a3..f01b7eed6e1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -99,50 +99,6 @@ int randomize_va_space __read_mostly =
 					2;
 #endif
 
-#ifndef track_pfn_vma_new
-/*
- * Interface that can be used by architecture code to keep track of
- * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
- *
- * track_pfn_vma_new is called when a _new_ pfn mapping is being established
- * for physical range indicated by pfn and size.
- */
-int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot,
-			unsigned long pfn, unsigned long size)
-{
-	return 0;
-}
-#endif
-
-#ifndef track_pfn_vma_copy
-/*
- * Interface that can be used by architecture code to keep track of
- * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
- *
- * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
- * copied through copy_page_range().
- */
-int track_pfn_vma_copy(struct vm_area_struct *vma)
-{
-	return 0;
-}
-#endif
-
-#ifndef untrack_pfn_vma
-/*
- * Interface that can be used by architecture code to keep track of
- * memory type of pfn mappings (remap_pfn_range, vm_insert_pfn)
- *
- * untrack_pfn_vma is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case size can be zero).
- */
-void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
-			unsigned long size)
-{
-}
-#endif
-
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
@@ -713,7 +669,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
-	if (is_pfn_mapping(vma)) {
+	if (unlikely(is_pfn_mapping(vma))) {
 		/*
 		 * We do not free on error cases below as remove_vma
 		 * gets called on error from higher level routine
@@ -969,7 +925,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		if (vma->vm_flags & VM_ACCOUNT)
 			*nr_accounted += (end - start) >> PAGE_SHIFT;
 
-		if (is_pfn_mapping(vma))
+		if (unlikely(is_pfn_mapping(vma)))
 			untrack_pfn_vma(vma, 0, 0);
 
 		while (start != end) {
-- 
cgit v1.2.3-70-g09d2


From bf53de907dfdaac178c92d774aae7370d7b97d20 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 19 Dec 2008 15:10:24 +0100
Subject: x86, bts: add fork and exit handling

Impact: introduce new ptrace facility

Add arch_ptrace_untrace() function that is called when the tracer
detaches (either voluntarily or when the tracing task dies);
ptrace_disable() is only called on a voluntary detach.

Add ptrace_fork() and arch_ptrace_fork(). They are called when a
traced task is forked.

Clear DS and BTS related fields on fork.

Release DS resources and reclaim memory in ptrace_untrace(). This
releases resources already when the tracing task dies. We used to do
that when the traced task dies.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h     |  9 ++++++++
 arch/x86/include/asm/ptrace.h |  7 ++++++
 arch/x86/kernel/ds.c          | 11 ++++++++++
 arch/x86/kernel/process_32.c  | 20 ++++++++---------
 arch/x86/kernel/process_64.c  | 20 ++++++++---------
 arch/x86/kernel/ptrace.c      | 50 ++++++++++++++++++++++++++++++++++---------
 include/linux/ptrace.h        | 22 +++++++++++++++++++
 kernel/fork.c                 |  2 ++
 kernel/ptrace.c               | 12 +++++++++++
 9 files changed, 121 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index ee0ea3a96c1..a8f672ba100 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -252,12 +252,21 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
  */
 extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
+/*
+ * Task clone/init and cleanup work
+ */
+extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
+extern void ds_exit_thread(struct task_struct *tsk);
+
 #else /* CONFIG_X86_DS */
 
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void ds_switch_to(struct task_struct *prev,
 				struct task_struct *next) {}
+static inline void ds_copy_thread(struct task_struct *tsk,
+				  struct task_struct *father) {}
+static inline void ds_exit_thread(struct task_struct *tsk) {}
 
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index fbf74421591..6d34d954c22 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -235,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
+extern void x86_ptrace_untrace(struct task_struct *);
+extern void x86_ptrace_fork(struct task_struct *child,
+			    unsigned long clone_flags);
+
+#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
+#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 98d271e60e0..da91701a234 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -1017,3 +1017,14 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 
 	update_debugctlmsr(next->thread.debugctlmsr);
 }
+
+void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+{
+	clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
+	tsk->thread.ds_ctx = NULL;
+}
+
+void ds_exit_thread(struct task_struct *tsk)
+{
+	WARN_ON(tsk->thread.ds_ctx);
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 605eff9a8ac..3ba155d2488 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -60,6 +60,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/smp.h>
+#include <asm/ds.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -251,17 +252,8 @@ void exit_thread(void)
 		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
-#ifdef CONFIG_X86_DS
-	/* Free any BTS tracers that have not been properly released. */
-	if (unlikely(current->bts)) {
-		ds_release_bts(current->bts);
-		current->bts = NULL;
-
-		kfree(current->bts_buffer);
-		current->bts_buffer = NULL;
-		current->bts_size = 0;
-	}
-#endif /* CONFIG_X86_DS */
+
+	ds_exit_thread(current);
 }
 
 void flush_thread(void)
@@ -343,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
+
+	ds_copy_thread(p, current);
+
+	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+	p->thread.debugctlmsr = 0;
+
 	return err;
 }
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1cfd2a4bf85..416fb9282f4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,6 +53,7 @@
 #include <asm/ia32.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
+#include <asm/ds.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -236,17 +237,8 @@ void exit_thread(void)
 		t->io_bitmap_max = 0;
 		put_cpu();
 	}
-#ifdef CONFIG_X86_DS
-	/* Free any BTS tracers that have not been properly released. */
-	if (unlikely(current->bts)) {
-		ds_release_bts(current->bts);
-		current->bts = NULL;
-
-		kfree(current->bts_buffer);
-		current->bts_buffer = NULL;
-		current->bts_size = 0;
-	}
-#endif /* CONFIG_X86_DS */
+
+	ds_exit_thread(current);
 }
 
 void flush_thread(void)
@@ -376,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		if (err)
 			goto out;
 	}
+
+	ds_copy_thread(p, me);
+
+	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
+	p->thread.debugctlmsr = 0;
+
 	err = 0;
 out:
 	if (err && p->thread.io_bitmap_ptr) {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45e9855da2d..6ad2bb60765 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -769,8 +769,47 @@ static int ptrace_bts_size(struct task_struct *child)
 
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
+
+static void ptrace_bts_fork(struct task_struct *tsk)
+{
+	tsk->bts = NULL;
+	tsk->bts_buffer = NULL;
+	tsk->bts_size = 0;
+	tsk->thread.bts_ovfl_signal = 0;
+}
+
+static void ptrace_bts_untrace(struct task_struct *child)
+{
+	if (unlikely(child->bts)) {
+		ds_release_bts(child->bts);
+		child->bts = NULL;
+
+		kfree(child->bts_buffer);
+		child->bts_buffer = NULL;
+		child->bts_size = 0;
+	}
+}
+
+static void ptrace_bts_detach(struct task_struct *child)
+{
+	ptrace_bts_untrace(child);
+}
+#else
+static inline void ptrace_bts_fork(struct task_struct *tsk) {}
+static inline void ptrace_bts_detach(struct task_struct *child) {}
+static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
+void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
+{
+	ptrace_bts_fork(child);
+}
+
+void x86_ptrace_untrace(struct task_struct *child)
+{
+	ptrace_bts_untrace(child);
+}
+
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -782,16 +821,7 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-#ifdef CONFIG_X86_PTRACE_BTS
-	if (child->bts) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-
-		kfree(child->bts_buffer);
-		child->bts_buffer = NULL;
-		child->bts_size = 0;
-	}
-#endif /* CONFIG_X86_PTRACE_BTS */
+	ptrace_bts_detach(child);
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 22641d5d45d..98b93ca4db0 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
+extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
@@ -313,6 +314,27 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 
+#ifndef arch_ptrace_untrace
+/*
+ * Do machine-specific work before untracing child.
+ *
+ * This is called for a normal detach as well as from ptrace_exit()
+ * when the tracing task dies.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+#define arch_ptrace_untrace(task)		do { } while (0)
+#endif
+
+#ifndef arch_ptrace_fork
+/*
+ * Do machine-specific work to initialize a new task.
+ *
+ * This is called from copy_process().
+ */
+#define arch_ptrace_fork(child, clone_flags)	do { } while (0)
+#endif
+
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/kernel/fork.c b/kernel/fork.c
index 7b93da72d4a..65ce60adc8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1096,6 +1096,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
+	if (unlikely(ptrace_reparented(current)))
+		ptrace_fork(p, clone_flags);
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4c8bcd7dd8e..100a71cfdab 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,17 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
+
+/*
+ * Initialize a new task whose father had been ptraced.
+ *
+ * Called from copy_process().
+ */
+void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
+{
+	arch_ptrace_fork(child, clone_flags);
+}
+
 /*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);
 
+	arch_ptrace_untrace(child);
 	if (task_is_traced(child))
 		ptrace_untrace(child);
 }
-- 
cgit v1.2.3-70-g09d2


From c5dee6177f4bd2095aab7d9be9f6ebdddd6deee9 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 19 Dec 2008 15:17:02 +0100
Subject: x86, bts: memory accounting

Impact: move the BTS buffer accounting to the mlock bucket

Add alloc_locked_buffer() and free_locked_buffer() functions to mm/mlock.c
to kalloc a buffer and account the locked memory to current.

Account the memory for the BTS buffer to the tracer.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c | 45 ++++++++++++++++++++++++++++++++++-----------
 include/linux/mm.h       |  2 ++
 mm/mlock.c               | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 6ad2bb60765..0a5df5f82fb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -650,6 +650,24 @@ static int ptrace_bts_drain(struct task_struct *child,
 	return drained;
 }
 
+static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
+{
+	child->bts_buffer = alloc_locked_buffer(size);
+	if (!child->bts_buffer)
+		return -ENOMEM;
+
+	child->bts_size = size;
+
+	return 0;
+}
+
+static void ptrace_bts_free_buffer(struct task_struct *child)
+{
+	free_locked_buffer(child->bts_buffer, child->bts_size);
+	child->bts_buffer = NULL;
+	child->bts_size = 0;
+}
+
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
@@ -679,14 +697,13 @@ static int ptrace_bts_config(struct task_struct *child,
 
 	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
 	    (cfg.size != child->bts_size)) {
-		kfree(child->bts_buffer);
+		int error;
 
-		child->bts_size = cfg.size;
-		child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
-		if (!child->bts_buffer) {
-			child->bts_size = 0;
-			return -ENOMEM;
-		}
+		ptrace_bts_free_buffer(child);
+
+		error = ptrace_bts_allocate_buffer(child, cfg.size);
+		if (error < 0)
+			return error;
 	}
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -701,10 +718,8 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (IS_ERR(child->bts)) {
 		int error = PTR_ERR(child->bts);
 
-		kfree(child->bts_buffer);
+		ptrace_bts_free_buffer(child);
 		child->bts = NULL;
-		child->bts_buffer = NULL;
-		child->bts_size = 0;
 
 		return error;
 	}
@@ -784,6 +799,9 @@ static void ptrace_bts_untrace(struct task_struct *child)
 		ds_release_bts(child->bts);
 		child->bts = NULL;
 
+		/* We cannot update total_vm and locked_vm since
+		   child's mm is already gone. But we can reclaim the
+		   memory. */
 		kfree(child->bts_buffer);
 		child->bts_buffer = NULL;
 		child->bts_size = 0;
@@ -792,7 +810,12 @@ static void ptrace_bts_untrace(struct task_struct *child)
 
 static void ptrace_bts_detach(struct task_struct *child)
 {
-	ptrace_bts_untrace(child);
+	if (unlikely(child->bts)) {
+		ds_release_bts(child->bts);
+		child->bts = NULL;
+
+		ptrace_bts_free_buffer(child);
+	}
 }
 #else
 static inline void ptrace_bts_fork(struct task_struct *tsk) {}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f74341..9979d3fab6e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1286,5 +1286,7 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
+extern void *alloc_locked_buffer(size_t size);
+extern void free_locked_buffer(void *buffer, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 1ada366570c..3035a56e761 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -667,3 +667,48 @@ void user_shm_unlock(size_t size, struct user_struct *user)
 	spin_unlock(&shmlock_user_lock);
 	free_uid(user);
 }
+
+void *alloc_locked_buffer(size_t size)
+{
+	unsigned long rlim, vm, pgsz;
+	void *buffer = NULL;
+
+	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+	vm   = current->mm->total_vm + pgsz;
+	if (rlim < vm)
+		goto out;
+
+	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+	vm   = current->mm->locked_vm + pgsz;
+	if (rlim < vm)
+		goto out;
+
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
+		goto out;
+
+	current->mm->total_vm  += pgsz;
+	current->mm->locked_vm += pgsz;
+
+ out:
+	up_write(&current->mm->mmap_sem);
+	return buffer;
+}
+
+void free_locked_buffer(void *buffer, size_t size)
+{
+	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	current->mm->total_vm  -= pgsz;
+	current->mm->locked_vm -= pgsz;
+
+	up_write(&current->mm->mmap_sem);
+
+	kfree(buffer);
+}
-- 
cgit v1.2.3-70-g09d2


From 749820928a2fd47ff536773d869d2c3f8038b7d1 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Fri, 5 Dec 2008 08:15:54 +0000
Subject: of/gpio: Implement of_gpio_count()

This function is used to count how many GPIOs are specified for
a device node.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 drivers/of/gpio.c       | 34 ++++++++++++++++++++++++++++++++++
 include/linux/of_gpio.h |  6 ++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/gpio.c b/drivers/of/gpio.c
index a4ba217116e..6eea601a920 100644
--- a/drivers/of/gpio.c
+++ b/drivers/of/gpio.c
@@ -79,6 +79,40 @@ err0:
 }
 EXPORT_SYMBOL(of_get_gpio_flags);
 
+/**
+ * of_gpio_count - Count GPIOs for a device
+ * @np:		device node to count GPIOs for
+ *
+ * The function returns the count of GPIOs specified for a node.
+ *
+ * Note that the empty GPIO specifiers counts too. For example,
+ *
+ * gpios = <0
+ *          &pio1 1 2
+ *          0
+ *          &pio2 3 4>;
+ *
+ * defines four GPIOs (so this function will return 4), two of which
+ * are not specified.
+ */
+unsigned int of_gpio_count(struct device_node *np)
+{
+	unsigned int cnt = 0;
+
+	do {
+		int ret;
+
+		ret = of_parse_phandles_with_args(np, "gpios", "#gpio-cells",
+						  cnt, NULL, NULL);
+		/* A hole in the gpios = <> counts anyway. */
+		if (ret < 0 && ret != -EEXIST)
+			break;
+	} while (++cnt);
+
+	return cnt;
+}
+EXPORT_SYMBOL(of_gpio_count);
+
 /**
  * of_gpio_simple_xlate - translate gpio_spec to the GPIO number and flags
  * @of_gc:	pointer to the of_gpio_chip structure
diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h
index e25abf610cb..fc2472c3c25 100644
--- a/include/linux/of_gpio.h
+++ b/include/linux/of_gpio.h
@@ -65,6 +65,7 @@ static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc)
 
 extern int of_get_gpio_flags(struct device_node *np, int index,
 			     enum of_gpio_flags *flags);
+extern unsigned int of_gpio_count(struct device_node *np);
 
 extern int of_mm_gpiochip_add(struct device_node *np,
 			      struct of_mm_gpio_chip *mm_gc);
@@ -81,6 +82,11 @@ static inline int of_get_gpio_flags(struct device_node *np, int index,
 	return -ENOSYS;
 }
 
+static inline unsigned int of_gpio_count(struct device_node *np)
+{
+	return 0;
+}
+
 #endif /* CONFIG_OF_GPIO */
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 3ddeb912f41801fd1968c7880d031702a396e4d0 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 20 Dec 2008 17:15:14 +0800
Subject: ftrace: enable format arguments checking

Impact: broaden gcc printf format checks for ftrace_printk()

format arguments checking for ftrace_printk() is __printf(1, 2),
not __printf(1, 0).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 04b52e6ebc6..677432b9cb7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -303,7 +303,7 @@ extern void ftrace_dump(void);
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
-ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
+ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
 
 static inline void tracing_start(void) { }
 static inline void tracing_stop(void) { }
-- 
cgit v1.2.3-70-g09d2


From f4314e815e87b4ab1c9b1115dd5853cd20ca999c Mon Sep 17 00:00:00 2001
From: Don Skidmore <donald.c.skidmore@intel.com>
Date: Sun, 21 Dec 2008 20:10:29 -0800
Subject: net: add DCNA attribute to the BCN interface for DCB

Adds the Backward Congestion Notification Address (BCNA) attribute to the
Backward Congestion Notification (BCN) interface for Data Center Bridging
(DCB), which was missing.  Receive the BCNA attribute in the ixgbe driver.
The BCNA attribute is for a switch to inform the endstation about the physical
port identification in order to support BCN on aggregated links.

Signed-off-by: Don Skidmore <donald.c.skidmore@intel.com>
Signed-off-by: Eric W Multanen <eric.w.multanen@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe_dcb_nl.c | 20 ++++++++++++++++++++
 include/linux/dcbnl.h            |  2 ++
 net/dcb/dcbnl.c                  |  6 ++++--
 3 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ixgbe/ixgbe_dcb_nl.c
index 7d158a5c545..8ac639d0da0 100644
--- a/drivers/net/ixgbe/ixgbe_dcb_nl.c
+++ b/drivers/net/ixgbe/ixgbe_dcb_nl.c
@@ -93,6 +93,8 @@ int ixgbe_copy_dcb_cfg(struct ixgbe_dcb_config *src_dcb_cfg,
 		dst_dcb_cfg->bcn.rp_admin_mode[i - DCB_BCN_ATTR_RP_0] =
 			src_dcb_cfg->bcn.rp_admin_mode[i - DCB_BCN_ATTR_RP_0];
 	}
+	dst_dcb_cfg->bcn.bcna_option[0] = src_dcb_cfg->bcn.bcna_option[0];
+	dst_dcb_cfg->bcn.bcna_option[1] = src_dcb_cfg->bcn.bcna_option[1];
 	dst_dcb_cfg->bcn.rp_alpha = src_dcb_cfg->bcn.rp_alpha;
 	dst_dcb_cfg->bcn.rp_beta = src_dcb_cfg->bcn.rp_beta;
 	dst_dcb_cfg->bcn.rp_gd = src_dcb_cfg->bcn.rp_gd;
@@ -457,6 +459,12 @@ static void ixgbe_dcbnl_getbcncfg(struct net_device *netdev, int enum_index,
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 
 	switch (enum_index) {
+	case DCB_BCN_ATTR_BCNA_0:
+		*setting = adapter->dcb_cfg.bcn.bcna_option[0];
+		break;
+	case DCB_BCN_ATTR_BCNA_1:
+		*setting = adapter->dcb_cfg.bcn.bcna_option[1];
+		break;
 	case DCB_BCN_ATTR_ALPHA:
 		*setting = adapter->dcb_cfg.bcn.rp_alpha;
 		break;
@@ -516,6 +524,18 @@ static void ixgbe_dcbnl_setbcncfg(struct net_device *netdev, int enum_index,
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 
 	switch (enum_index) {
+	case DCB_BCN_ATTR_BCNA_0:
+		adapter->temp_dcb_cfg.bcn.bcna_option[0] = setting;
+		if (adapter->temp_dcb_cfg.bcn.bcna_option[0] !=
+			adapter->dcb_cfg.bcn.bcna_option[0])
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
+	case DCB_BCN_ATTR_BCNA_1:
+		adapter->temp_dcb_cfg.bcn.bcna_option[1] = setting;
+		if (adapter->temp_dcb_cfg.bcn.bcna_option[1] !=
+			adapter->dcb_cfg.bcn.bcna_option[1])
+			adapter->dcb_set_bitmap |= BIT_BCN;
+		break;
 	case DCB_BCN_ATTR_ALPHA:
 		adapter->temp_dcb_cfg.bcn.rp_alpha = setting;
 		if (adapter->temp_dcb_cfg.bcn.rp_alpha !=
diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index e73a61449ad..b0ef274e003 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -305,6 +305,8 @@ enum dcbnl_bcn_attrs{
 	DCB_BCN_ATTR_RP_7,
 	DCB_BCN_ATTR_RP_ALL,
 
+	DCB_BCN_ATTR_BCNA_0,
+	DCB_BCN_ATTR_BCNA_1,
 	DCB_BCN_ATTR_ALPHA,
 	DCB_BCN_ATTR_BETA,
 	DCB_BCN_ATTR_GD,
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index fc88fc4d4f6..5dbfe5fdc0d 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -140,6 +140,8 @@ static struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = {
 	[DCB_BCN_ATTR_RP_6]         = {.type = NLA_U8},
 	[DCB_BCN_ATTR_RP_7]         = {.type = NLA_U8},
 	[DCB_BCN_ATTR_RP_ALL]       = {.type = NLA_FLAG},
+	[DCB_BCN_ATTR_BCNA_0]       = {.type = NLA_U32},
+	[DCB_BCN_ATTR_BCNA_1]       = {.type = NLA_U32},
 	[DCB_BCN_ATTR_ALPHA]        = {.type = NLA_U32},
 	[DCB_BCN_ATTR_BETA]         = {.type = NLA_U32},
 	[DCB_BCN_ATTR_GD]           = {.type = NLA_U32},
@@ -922,7 +924,7 @@ static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlattr **tb,
 			goto err_bcn;
 	}
 
-	for (i = DCB_BCN_ATTR_ALPHA; i <= DCB_BCN_ATTR_RI; i++) {
+	for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
 		if (!getall && !bcn_tb[i])
 			continue;
 
@@ -980,7 +982,7 @@ static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlattr **tb,
 			data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte);
 	}
 
-	for (i = DCB_BCN_ATTR_ALPHA; i <= DCB_BCN_ATTR_RI; i++) {
+	for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
 		if (data[i] == NULL)
 			continue;
 		value_int = nla_get_u32(data[i]);
-- 
cgit v1.2.3-70-g09d2


From 209aa4fdc39eacc145a7f9c32a4b9ffcc68912c6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 12 Dec 2008 16:35:40 +0900
Subject: fb: SH-5 uses __raw I/O accessors now also, drop the special casing.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 include/linux/fb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fb.h b/include/linux/fb.h
index 75a81eaf343..1ee63df5be9 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -888,7 +888,7 @@ struct fb_info {
 #define fb_writeq sbus_writeq
 #define fb_memset sbus_memset_io
 
-#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || (defined(__sh__) && !defined(__SH5__)) || defined(__powerpc__) || defined(__avr32__)
+#elif defined(__i386__) || defined(__alpha__) || defined(__x86_64__) || defined(__hppa__) || defined(__sh__) || defined(__powerpc__) || defined(__avr32__)
 
 #define fb_readb __raw_readb
 #define fb_readw __raw_readw
-- 
cgit v1.2.3-70-g09d2


From b8dd786f9417e5885929bfe33a235c76a9c1c569 Mon Sep 17 00:00:00 2001
From: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Date: Mon, 22 Dec 2008 07:15:03 -0800
Subject: mlx4_core: Add support for multiple completion event vectors

When using MSI-X mode, create a completion event queue for each CPU.
Report the number of completion EQs in a new struct mlx4_caps member,
num_comp_vectors, and extend the mlx4_cq_alloc() interface with a
vector parameter so that consumers can specify which completion EQ
should be used to report events for the CQ being created.

Signed-off-by: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
---
 drivers/infiniband/hw/mlx4/cq.c   |   2 +-
 drivers/infiniband/hw/mlx4/main.c |   2 +-
 drivers/net/mlx4/cq.c             |  11 +++-
 drivers/net/mlx4/en_cq.c          |   9 ++-
 drivers/net/mlx4/en_main.c        |   4 +-
 drivers/net/mlx4/eq.c             | 117 ++++++++++++++++++++++++++++----------
 drivers/net/mlx4/main.c           |  53 ++++++++++++-----
 drivers/net/mlx4/mlx4.h           |  14 ++---
 drivers/net/mlx4/profile.c        |   4 +-
 include/linux/mlx4/device.h       |   4 +-
 10 files changed, 157 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 18308494a19..2198753bf13 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -222,7 +222,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector
 	}
 
 	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-			    cq->db.dma, &cq->mcq, 0);
+			    cq->db.dma, &cq->mcq, vector, 0);
 	if (err)
 		goto err_dbmap;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 2e80f8f47b0..dcefe1fceb5 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -578,7 +578,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
 		ibdev->num_ports++;
 	ibdev->ib_dev.phys_port_cnt     = ibdev->num_ports;
-	ibdev->ib_dev.num_comp_vectors	= 1;
+	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
 	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
 
 	ibdev->ib_dev.uverbs_abi_ver	= MLX4_IB_UVERBS_ABI_VERSION;
diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c
index b7ad2829d67..ac57b6a42c6 100644
--- a/drivers/net/mlx4/cq.c
+++ b/drivers/net/mlx4/cq.c
@@ -189,7 +189,7 @@ EXPORT_SYMBOL_GPL(mlx4_cq_resize);
 
 int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
-		  int collapsed)
+		  unsigned vector, int collapsed)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_cq_table *cq_table = &priv->cq_table;
@@ -198,6 +198,11 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 	u64 mtt_addr;
 	int err;
 
+	if (vector >= dev->caps.num_comp_vectors)
+		return -EINVAL;
+
+	cq->vector = vector;
+
 	cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
 	if (cq->cqn == -1)
 		return -ENOMEM;
@@ -227,7 +232,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 
 	cq_context->flags	    = cpu_to_be32(!!collapsed << 18);
 	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
-	cq_context->comp_eqn        = priv->eq_table.eq[MLX4_EQ_COMP].eqn;
+	cq_context->comp_eqn	    = priv->eq_table.eq[vector].eqn;
 	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
 
 	mtt_addr = mlx4_mtt_addr(dev, mtt);
@@ -276,7 +281,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
 	if (err)
 		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
 
-	synchronize_irq(priv->eq_table.eq[MLX4_EQ_COMP].irq);
+	synchronize_irq(priv->eq_table.eq[cq->vector].irq);
 
 	spin_lock_irq(&cq_table->lock);
 	radix_tree_delete(&cq_table->tree, cq->cqn);
diff --git a/drivers/net/mlx4/en_cq.c b/drivers/net/mlx4/en_cq.c
index 1368a8010af..674f836e225 100644
--- a/drivers/net/mlx4/en_cq.c
+++ b/drivers/net/mlx4/en_cq.c
@@ -51,10 +51,13 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv,
 	int err;
 
 	cq->size = entries;
-	if (mode == RX)
+	if (mode == RX) {
 		cq->buf_size = cq->size * sizeof(struct mlx4_cqe);
-	else
+		cq->vector   = ring % mdev->dev->caps.num_comp_vectors;
+	} else {
 		cq->buf_size = sizeof(struct mlx4_cqe);
+		cq->vector   = 0;
+	}
 
 	cq->ring = ring;
 	cq->is_tx = mode;
@@ -86,7 +89,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
 	memset(cq->buf, 0, cq->buf_size);
 
 	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar,
-			    cq->wqres.db.dma, &cq->mcq, cq->is_tx);
+			    cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx);
 	if (err)
 		return err;
 
diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c
index 4b9794e97a7..c1c05852a95 100644
--- a/drivers/net/mlx4/en_main.c
+++ b/drivers/net/mlx4/en_main.c
@@ -170,9 +170,9 @@ static void *mlx4_en_add(struct mlx4_dev *dev)
 		mlx4_info(mdev, "Using %d tx rings for port:%d\n",
 			  mdev->profile.prof[i].tx_ring_num, i);
 		if (!mdev->profile.prof[i].rx_ring_num) {
-			mdev->profile.prof[i].rx_ring_num = 1;
+			mdev->profile.prof[i].rx_ring_num = dev->caps.num_comp_vectors;
 			mlx4_info(mdev, "Defaulting to %d rx rings for port:%d\n",
-				  1, i);
+				  mdev->profile.prof[i].rx_ring_num, i);
 		} else
 			mlx4_info(mdev, "Using %d rx rings for port:%d\n",
 				  mdev->profile.prof[i].rx_ring_num, i);
diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index de169338cd9..5d867ebe6a4 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -266,7 +266,7 @@ static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
 
 	writel(priv->eq_table.clr_mask, priv->eq_table.clr_int);
 
-	for (i = 0; i < MLX4_NUM_EQ; ++i)
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
 		work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]);
 
 	return IRQ_RETVAL(work);
@@ -304,6 +304,17 @@ static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
 			    MLX4_CMD_TIME_CLASS_A);
 }
 
+static int mlx4_num_eq_uar(struct mlx4_dev *dev)
+{
+	/*
+	 * Each UAR holds 4 EQ doorbells.  To figure out how many UARs
+	 * we need to map, take the difference of highest index and
+	 * the lowest index we'll use and add 1.
+	 */
+	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 -
+		dev->caps.reserved_eqs / 4 + 1;
+}
+
 static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -483,9 +494,11 @@ static void mlx4_free_irqs(struct mlx4_dev *dev)
 
 	if (eq_table->have_irq)
 		free_irq(dev->pdev->irq, dev);
-	for (i = 0; i < MLX4_NUM_EQ; ++i)
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
 		if (eq_table->eq[i].have_irq)
 			free_irq(eq_table->eq[i].irq, eq_table->eq + i);
+
+	kfree(eq_table->irq_names);
 }
 
 static int mlx4_map_clr_int(struct mlx4_dev *dev)
@@ -551,57 +564,93 @@ void mlx4_unmap_eq_icm(struct mlx4_dev *dev)
 	__free_page(priv->eq_table.icm_page);
 }
 
+int mlx4_alloc_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs,
+				    sizeof *priv->eq_table.eq, GFP_KERNEL);
+	if (!priv->eq_table.eq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_free_eq_table(struct mlx4_dev *dev)
+{
+	kfree(mlx4_priv(dev)->eq_table.eq);
+}
+
 int mlx4_init_eq_table(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	int err;
 	int i;
 
+	priv->eq_table.uar_map = kcalloc(sizeof *priv->eq_table.uar_map,
+					 mlx4_num_eq_uar(dev), GFP_KERNEL);
+	if (!priv->eq_table.uar_map) {
+		err = -ENOMEM;
+		goto err_out_free;
+	}
+
 	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
 			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
 	if (err)
-		return err;
+		goto err_out_free;
 
-	for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i)
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
 		priv->eq_table.uar_map[i] = NULL;
 
 	err = mlx4_map_clr_int(dev);
 	if (err)
-		goto err_out_free;
+		goto err_out_bitmap;
 
 	priv->eq_table.clr_mask =
 		swab32(1 << (priv->eq_table.inta_pin & 31));
 	priv->eq_table.clr_int  = priv->clr_base +
 		(priv->eq_table.inta_pin < 32 ? 4 : 0);
 
-	err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
-			     (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_COMP : 0,
-			     &priv->eq_table.eq[MLX4_EQ_COMP]);
-	if (err)
-		goto err_out_unmap;
+	priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL);
+	if (!priv->eq_table.irq_names) {
+		err = -ENOMEM;
+		goto err_out_bitmap;
+	}
+
+	for (i = 0; i < dev->caps.num_comp_vectors; ++i) {
+		err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
+				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
+				     &priv->eq_table.eq[i]);
+		if (err)
+			goto err_out_unmap;
+	}
 
 	err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
-			     (dev->flags & MLX4_FLAG_MSI_X) ? MLX4_EQ_ASYNC : 0,
-			     &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+			     (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0,
+			     &priv->eq_table.eq[dev->caps.num_comp_vectors]);
 	if (err)
 		goto err_out_comp;
 
 	if (dev->flags & MLX4_FLAG_MSI_X) {
-		static const char *eq_name[] = {
-			[MLX4_EQ_COMP]  = DRV_NAME " (comp)",
-			[MLX4_EQ_ASYNC] = DRV_NAME " (async)"
-		};
+		static const char async_eq_name[] = "mlx4-async";
+		const char *eq_name;
+
+		for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
+			if (i < dev->caps.num_comp_vectors) {
+				snprintf(priv->eq_table.irq_names + i * 16, 16,
+					 "mlx4-comp-%d", i);
+				eq_name = priv->eq_table.irq_names + i * 16;
+			} else
+				eq_name = async_eq_name;
 
-		for (i = 0; i < MLX4_NUM_EQ; ++i) {
 			err = request_irq(priv->eq_table.eq[i].irq,
-					  mlx4_msi_x_interrupt,
-					  0, eq_name[i], priv->eq_table.eq + i);
+					  mlx4_msi_x_interrupt, 0, eq_name,
+					  priv->eq_table.eq + i);
 			if (err)
 				goto err_out_async;
 
 			priv->eq_table.eq[i].have_irq = 1;
 		}
-
 	} else {
 		err = request_irq(dev->pdev->irq, mlx4_interrupt,
 				  IRQF_SHARED, DRV_NAME, dev);
@@ -612,28 +661,36 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 	}
 
 	err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
-			  priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+			  priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
 	if (err)
 		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
-			   priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err);
+			   priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err);
 
-	for (i = 0; i < MLX4_NUM_EQ; ++i)
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
 		eq_set_ci(&priv->eq_table.eq[i], 1);
 
 	return 0;
 
 err_out_async:
-	mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+	mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]);
 
 err_out_comp:
-	mlx4_free_eq(dev, &priv->eq_table.eq[MLX4_EQ_COMP]);
+	i = dev->caps.num_comp_vectors - 1;
 
 err_out_unmap:
+	while (i >= 0) {
+		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+		--i;
+	}
 	mlx4_unmap_clr_int(dev);
 	mlx4_free_irqs(dev);
 
-err_out_free:
+err_out_bitmap:
 	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+err_out_free:
+	kfree(priv->eq_table.uar_map);
+
 	return err;
 }
 
@@ -643,18 +700,20 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
 	int i;
 
 	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1,
-		    priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
 
 	mlx4_free_irqs(dev);
 
-	for (i = 0; i < MLX4_NUM_EQ; ++i)
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
 		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
 
 	mlx4_unmap_clr_int(dev);
 
-	for (i = 0; i < ARRAY_SIZE(priv->eq_table.uar_map); ++i)
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
 		if (priv->eq_table.uar_map[i])
 			iounmap(priv->eq_table.uar_map[i]);
 
 	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+	kfree(priv->eq_table.uar_map);
 }
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 90a0281d15e..710c79e7a2d 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -421,9 +421,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
 				  ((u64) (MLX4_CMPT_TYPE_EQ *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
 				  cmpt_entry_sz,
-				  roundup_pow_of_two(MLX4_NUM_EQ +
-						     dev->caps.reserved_eqs),
-				  MLX4_NUM_EQ + dev->caps.reserved_eqs, 0, 0);
+				  dev->caps.num_eqs, dev->caps.num_eqs, 0, 0);
 	if (err)
 		goto err_cq;
 
@@ -810,12 +808,12 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 		if (dev->flags & MLX4_FLAG_MSI_X) {
 			mlx4_warn(dev, "NOP command failed to generate MSI-X "
 				  "interrupt IRQ %d).\n",
-				  priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+				  priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
 			mlx4_warn(dev, "Trying again without MSI-X.\n");
 		} else {
 			mlx4_err(dev, "NOP command failed to generate interrupt "
 				 "(IRQ %d), aborting.\n",
-				 priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+				 priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
 			mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
 		}
 
@@ -908,31 +906,50 @@ err_uar_table_free:
 static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	struct msix_entry entries[MLX4_NUM_EQ];
+	struct msix_entry *entries;
+	int nreq;
 	int err;
 	int i;
 
 	if (msi_x) {
-		for (i = 0; i < MLX4_NUM_EQ; ++i)
+		nreq = min(dev->caps.num_eqs - dev->caps.reserved_eqs,
+			   num_possible_cpus() + 1);
+		entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
+		if (!entries)
+			goto no_msi;
+
+		for (i = 0; i < nreq; ++i)
 			entries[i].entry = i;
 
-		err = pci_enable_msix(dev->pdev, entries, ARRAY_SIZE(entries));
+	retry:
+		err = pci_enable_msix(dev->pdev, entries, nreq);
 		if (err) {
-			if (err > 0)
-				mlx4_info(dev, "Only %d MSI-X vectors available, "
-					  "not using MSI-X\n", err);
+			/* Try again if at least 2 vectors are available */
+			if (err > 1) {
+				mlx4_info(dev, "Requested %d vectors, "
+					  "but only %d MSI-X vectors available, "
+					  "trying again\n", nreq, err);
+				nreq = err;
+				goto retry;
+			}
+
 			goto no_msi;
 		}
 
-		for (i = 0; i < MLX4_NUM_EQ; ++i)
+		dev->caps.num_comp_vectors = nreq - 1;
+		for (i = 0; i < nreq; ++i)
 			priv->eq_table.eq[i].irq = entries[i].vector;
 
 		dev->flags |= MLX4_FLAG_MSI_X;
+
+		kfree(entries);
 		return;
 	}
 
 no_msi:
-	for (i = 0; i < MLX4_NUM_EQ; ++i)
+	dev->caps.num_comp_vectors = 1;
+
+	for (i = 0; i < 2; ++i)
 		priv->eq_table.eq[i].irq = dev->pdev->irq;
 }
 
@@ -1074,6 +1091,10 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_cmd;
 
+	err = mlx4_alloc_eq_table(dev);
+	if (err)
+		goto err_close;
+
 	mlx4_enable_msi_x(dev);
 
 	err = mlx4_setup_hca(dev);
@@ -1084,7 +1105,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	}
 
 	if (err)
-		goto err_close;
+		goto err_free_eq;
 
 	for (port = 1; port <= dev->caps.num_ports; port++) {
 		err = mlx4_init_port_info(dev, port);
@@ -1114,6 +1135,9 @@ err_port:
 	mlx4_cleanup_pd_table(dev);
 	mlx4_cleanup_uar_table(dev);
 
+err_free_eq:
+	mlx4_free_eq_table(dev);
+
 err_close:
 	if (dev->flags & MLX4_FLAG_MSI_X)
 		pci_disable_msix(pdev);
@@ -1177,6 +1201,7 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 		iounmap(priv->kar);
 		mlx4_uar_free(dev, &priv->driver_uar);
 		mlx4_cleanup_uar_table(dev);
+		mlx4_free_eq_table(dev);
 		mlx4_close_hca(dev);
 		mlx4_cmd_cleanup(dev);
 
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 34c909deaff..e0213bad61c 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -62,12 +62,6 @@ enum {
 	MLX4_MTT_ENTRY_PER_SEG	= 8
 };
 
-enum {
-	MLX4_EQ_ASYNC,
-	MLX4_EQ_COMP,
-	MLX4_NUM_EQ
-};
-
 enum {
 	MLX4_NUM_PDS		= 1 << 15
 };
@@ -205,10 +199,11 @@ struct mlx4_cq_table {
 
 struct mlx4_eq_table {
 	struct mlx4_bitmap	bitmap;
+	char		       *irq_names;
 	void __iomem	       *clr_int;
-	void __iomem	       *uar_map[(MLX4_NUM_EQ + 6) / 4];
+	void __iomem	      **uar_map;
 	u32			clr_mask;
-	struct mlx4_eq		eq[MLX4_NUM_EQ];
+	struct mlx4_eq	       *eq;
 	u64			icm_virt;
 	struct page	       *icm_page;
 	dma_addr_t		icm_dma;
@@ -328,6 +323,9 @@ void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
 
 int mlx4_reset(struct mlx4_dev *dev);
 
+int mlx4_alloc_eq_table(struct mlx4_dev *dev);
+void mlx4_free_eq_table(struct mlx4_dev *dev);
+
 int mlx4_init_pd_table(struct mlx4_dev *dev);
 int mlx4_init_uar_table(struct mlx4_dev *dev);
 int mlx4_init_mr_table(struct mlx4_dev *dev);
diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c
index 9ca42b213d5..919fb9eb1b6 100644
--- a/drivers/net/mlx4/profile.c
+++ b/drivers/net/mlx4/profile.c
@@ -107,7 +107,9 @@ u64 mlx4_make_profile(struct mlx4_dev *dev,
 	profile[MLX4_RES_AUXC].num    = request->num_qp;
 	profile[MLX4_RES_SRQ].num     = request->num_srq;
 	profile[MLX4_RES_CQ].num      = request->num_cq;
-	profile[MLX4_RES_EQ].num      = MLX4_NUM_EQ + dev_cap->reserved_eqs;
+	profile[MLX4_RES_EQ].num      = min(dev_cap->max_eqs,
+					    dev_cap->reserved_eqs +
+					    num_possible_cpus() + 1);
 	profile[MLX4_RES_DMPT].num    = request->num_mpt;
 	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
 	profile[MLX4_RES_MTT].num     = request->num_mtt;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 371086fd946..8f659cc2996 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -206,6 +206,7 @@ struct mlx4_caps {
 	int			reserved_cqs;
 	int			num_eqs;
 	int			reserved_eqs;
+	int			num_comp_vectors;
 	int			num_mpts;
 	int			num_mtt_segs;
 	int			fmr_reserved_mtts;
@@ -328,6 +329,7 @@ struct mlx4_cq {
 	int			arm_sn;
 
 	int			cqn;
+	unsigned		vector;
 
 	atomic_t		refcount;
 	struct completion	free;
@@ -437,7 +439,7 @@ void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres,
 
 int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
 		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
-		  int collapsed);
+		  unsigned vector, int collapsed);
 void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
 
 int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
-- 
cgit v1.2.3-70-g09d2


From a01777ecf227de735d7e525ecda48fe74b838a17 Mon Sep 17 00:00:00 2001
From: David Vrabel <david.vrabel@csr.com>
Date: Mon, 22 Dec 2008 18:30:29 +0000
Subject: uwb: remove unused include/linux/uwb/debug.h

Signed-off-by: David Vrabel <david.vrabel@csr.com>
---
 drivers/uwb/address.c                 |  2 +-
 drivers/uwb/driver.c                  |  2 +-
 drivers/uwb/i1480/i1480u-wlp/lc.c     |  2 +-
 drivers/uwb/i1480/i1480u-wlp/netdev.c |  2 +-
 drivers/uwb/i1480/i1480u-wlp/sysfs.c  |  2 +-
 include/linux/uwb/debug.h             | 82 -----------------------------------
 6 files changed, 5 insertions(+), 87 deletions(-)
 delete mode 100644 include/linux/uwb/debug.h

(limited to 'include/linux')

diff --git a/drivers/uwb/address.c b/drivers/uwb/address.c
index 1664ae5f170..ad21b1d7218 100644
--- a/drivers/uwb/address.c
+++ b/drivers/uwb/address.c
@@ -28,7 +28,7 @@
 #include <linux/device.h>
 #include <linux/random.h>
 #include <linux/etherdevice.h>
-#include <linux/uwb/debug.h>
+
 #include "uwb-internal.h"
 
 
diff --git a/drivers/uwb/driver.c b/drivers/uwb/driver.c
index f57c26580de..da77e41de99 100644
--- a/drivers/uwb/driver.c
+++ b/drivers/uwb/driver.c
@@ -53,7 +53,7 @@
 #include <linux/err.h>
 #include <linux/kdev_t.h>
 #include <linux/random.h>
-#include <linux/uwb/debug.h>
+
 #include "uwb-internal.h"
 
 
diff --git a/drivers/uwb/i1480/i1480u-wlp/lc.c b/drivers/uwb/i1480/i1480u-wlp/lc.c
index 488b2e30a0a..049c05d4cc6 100644
--- a/drivers/uwb/i1480/i1480u-wlp/lc.c
+++ b/drivers/uwb/i1480/i1480u-wlp/lc.c
@@ -57,7 +57,7 @@
  */
 #include <linux/if_arp.h>
 #include <linux/etherdevice.h>
-#include <linux/uwb/debug.h>
+
 #include "i1480u-wlp.h"
 
 
diff --git a/drivers/uwb/i1480/i1480u-wlp/netdev.c b/drivers/uwb/i1480/i1480u-wlp/netdev.c
index 2eafb973cd0..e3873ffb942 100644
--- a/drivers/uwb/i1480/i1480u-wlp/netdev.c
+++ b/drivers/uwb/i1480/i1480u-wlp/netdev.c
@@ -41,7 +41,7 @@
 
 #include <linux/if_arp.h>
 #include <linux/etherdevice.h>
-#include <linux/uwb/debug.h>
+
 #include "i1480u-wlp.h"
 
 struct i1480u_cmd_set_ip_mas {
diff --git a/drivers/uwb/i1480/i1480u-wlp/sysfs.c b/drivers/uwb/i1480/i1480u-wlp/sysfs.c
index a92a787725f..4ffaf546cc6 100644
--- a/drivers/uwb/i1480/i1480u-wlp/sysfs.c
+++ b/drivers/uwb/i1480/i1480u-wlp/sysfs.c
@@ -25,8 +25,8 @@
 
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
-#include <linux/uwb/debug.h>
 #include <linux/device.h>
+
 #include "i1480u-wlp.h"
 
 
diff --git a/include/linux/uwb/debug.h b/include/linux/uwb/debug.h
deleted file mode 100644
index 67a24052714..00000000000
--- a/include/linux/uwb/debug.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Ultra Wide Band
- * Debug Support
- *
- * Copyright (C) 2005-2006 Intel Corporation
- * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- *
- * FIXME: doc
- * Invoke like:
- *
- * #define D_LOCAL 4
- * #include <linux/uwb/debug.h>
- *
- * At the end of your include files.
- */
-#include <linux/types.h>
-
-struct device;
-extern void dump_bytes(struct device *dev, const void *_buf, size_t rsize);
-
-/* Master debug switch; !0 enables, 0 disables */
-#define D_MASTER (!0)
-
-/* Local (per-file) debug switch; #define before #including */
-#ifndef D_LOCAL
-#define D_LOCAL 0
-#endif
-
-#undef __d_printf
-#undef d_fnstart
-#undef d_fnend
-#undef d_printf
-#undef d_dump
-
-#define __d_printf(l, _tag, _dev, f, a...)				\
-do {									\
-	struct device *__dev = (_dev);					\
-	if (D_MASTER && D_LOCAL >= (l)) {				\
-		char __head[64] = "";					\
-		if (_dev != NULL) {					\
-			if ((unsigned long)__dev < 4096)		\
-				printk(KERN_ERR "E: Corrupt dev %p\n",	\
-					__dev);				\
-			else						\
-				snprintf(__head, sizeof(__head),	\
-					 "%s %s: ",			\
-					 dev_driver_string(__dev),	\
-					 dev_name(__dev));		\
-		}							\
-		printk(KERN_ERR "%s%s" _tag ": " f, __head,		\
-			__func__, ## a);				\
-	}								\
-} while (0 && _dev)
-
-#define d_fnstart(l, _dev, f, a...)	\
-	__d_printf(l, " FNSTART", _dev, f, ## a)
-#define d_fnend(l, _dev, f, a...)	\
-	__d_printf(l, " FNEND", _dev, f, ## a)
-#define d_printf(l, _dev, f, a...)	\
-	__d_printf(l, "", _dev, f, ## a)
-#define d_dump(l, _dev, ptr, size)		\
-do {						\
-	struct device *__dev = _dev;		\
-	if (D_MASTER && D_LOCAL >= (l))		\
-		dump_bytes(__dev, ptr, size);	\
-} while (0 && _dev)
-#define d_test(l) (D_MASTER && D_LOCAL >= (l))
-- 
cgit v1.2.3-70-g09d2


From 908a7a16b852ffd618a9127be8d62432182d81b4 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 22 Dec 2008 20:43:12 -0800
Subject: net: Remove unused netdev arg from some NAPI interfaces.

When the napi api was changed to separate its 1:1 binding to the net_device
struct, the netif_rx_[prep|schedule|complete] api failed to remove the now
vestigual net_device structure parameter.  This patch cleans up that api by
properly removing it..

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/nes/nes_hw.c      |  2 +-
 drivers/infiniband/hw/nes/nes_nic.c     |  2 +-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c |  6 +++---
 drivers/net/8139cp.c                    |  6 +++---
 drivers/net/8139too.c                   |  6 +++---
 drivers/net/amd8111e.c                  |  6 +++---
 drivers/net/arm/ep93xx_eth.c            |  6 +++---
 drivers/net/arm/ixp4xx_eth.c            |  6 +++---
 drivers/net/atl1e/atl1e_main.c          |  6 +++---
 drivers/net/b44.c                       |  6 +++---
 drivers/net/bnx2.c                      | 15 ++++++---------
 drivers/net/bnx2x_main.c                |  6 +++---
 drivers/net/cassini.c                   |  8 ++++----
 drivers/net/chelsio/sge.c               |  4 ++--
 drivers/net/cpmac.c                     | 10 +++++-----
 drivers/net/e100.c                      |  7 +++----
 drivers/net/e1000/e1000_main.c          | 10 +++++-----
 drivers/net/e1000e/netdev.c             | 14 +++++++-------
 drivers/net/ehea/ehea_main.c            |  6 +++---
 drivers/net/enic/enic_main.c            | 12 ++++++------
 drivers/net/epic100.c                   |  6 +++---
 drivers/net/forcedeth.c                 | 10 +++++-----
 drivers/net/fs_enet/fs_enet-main.c      |  4 ++--
 drivers/net/gianfar.c                   |  6 +++---
 drivers/net/ibmveth.c                   |  6 +++---
 drivers/net/igb/igb_main.c              | 12 ++++++------
 drivers/net/ixgb/ixgb_main.c            |  6 +++---
 drivers/net/ixgbe/ixgbe_main.c          | 12 ++++++------
 drivers/net/ixp2000/ixpdev.c            |  4 ++--
 drivers/net/jme.c                       |  1 -
 drivers/net/jme.h                       |  6 +++---
 drivers/net/korina.c                    |  4 ++--
 drivers/net/macb.c                      | 10 +++++-----
 drivers/net/mlx4/en_rx.c                |  4 ++--
 drivers/net/myri10ge/myri10ge.c         |  6 +++---
 drivers/net/natsemi.c                   |  6 +++---
 drivers/net/netxen/netxen_nic_main.c    |  2 +-
 drivers/net/niu.c                       |  6 +++---
 drivers/net/pasemi_mac.c                |  6 +++---
 drivers/net/pcnet32.c                   |  6 +++---
 drivers/net/qla3xxx.c                   |  6 +++---
 drivers/net/qlge/qlge_main.c            |  7 +++----
 drivers/net/r6040.c                     |  4 ++--
 drivers/net/r8169.c                     |  6 +++---
 drivers/net/s2io.c                      |  8 ++++----
 drivers/net/sb1250-mac.c                |  6 +++---
 drivers/net/sfc/efx.c                   |  2 +-
 drivers/net/sfc/efx.h                   |  2 +-
 drivers/net/skge.c                      |  6 +++---
 drivers/net/smsc911x.c                  |  2 +-
 drivers/net/smsc9420.c                  |  4 ++--
 drivers/net/spider_net.c                | 15 ++++++---------
 drivers/net/starfire.c                  |  6 +++---
 drivers/net/sungem.c                    |  6 +++---
 drivers/net/tc35815.c                   |  6 +++---
 drivers/net/tehuti.c                    |  7 +++----
 drivers/net/tg3.c                       | 14 +++++++-------
 drivers/net/tsi108_eth.c                |  6 +++---
 drivers/net/tulip/interrupt.c           |  8 ++++----
 drivers/net/typhoon.c                   |  7 +++----
 drivers/net/ucc_geth.c                  |  6 +++---
 drivers/net/via-rhine.c                 |  4 ++--
 drivers/net/virtio_net.c                | 12 ++++++------
 drivers/net/wan/hd64572.c               |  4 ++--
 drivers/net/xen-netfront.c              |  8 ++++----
 include/linux/netdevice.h               | 24 +++++++++---------------
 66 files changed, 218 insertions(+), 235 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index 7c49cc882d7..735c125b48a 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -2541,7 +2541,7 @@ static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic
 {
 	struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
 
-	netif_rx_schedule(nesdev->netdev[nesvnic->netdev_index], &nesvnic->napi);
+	netif_rx_schedule(&nesvnic->napi);
 }
 
 
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index 3c96203e0d9..80e7a4d98d5 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -112,7 +112,7 @@ static int nes_netdev_poll(struct napi_struct *napi, int budget)
 	nes_nic_ce_handler(nesdev, nescq);
 
 	if (nescq->cqes_pending == 0) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		/* clear out completed cqes and arm */
 		nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
 				nescq->cq_number | (nescq->cqe_allocs_pending << 16));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 28eb6f03c58..a1925810be3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -446,11 +446,11 @@ poll_more:
 		if (dev->features & NETIF_F_LRO)
 			lro_flush_all(&priv->lro.lro_mgr);
 
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		if (unlikely(ib_req_notify_cq(priv->recv_cq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
-		    netif_rx_reschedule(dev, napi))
+		    netif_rx_reschedule(napi))
 			goto poll_more;
 	}
 
@@ -462,7 +462,7 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
 	struct net_device *dev = dev_ptr;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	netif_rx_schedule(dev, &priv->napi);
+	netif_rx_schedule(&priv->napi);
 }
 
 static void drain_tx_cq(struct net_device *dev)
diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index f6d9d1353dd..dd7ac8290ae 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -604,7 +604,7 @@ rx_next:
 
 		spin_lock_irqsave(&cp->lock, flags);
 		cpw16_f(IntrMask, cp_intr_mask);
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 		spin_unlock_irqrestore(&cp->lock, flags);
 	}
 
@@ -641,9 +641,9 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance)
 	}
 
 	if (status & (RxOK | RxErr | RxEmpty | RxFIFOOvr))
-		if (netif_rx_schedule_prep(dev, &cp->napi)) {
+		if (netif_rx_schedule_prep(&cp->napi)) {
 			cpw16_f(IntrMask, cp_norx_intr_mask);
-			__netif_rx_schedule(dev, &cp->napi);
+			__netif_rx_schedule(&cp->napi);
 		}
 
 	if (status & (TxOK | TxErr | TxEmpty | SWInt))
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 67bbf4f25be..fe370f80579 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -2128,7 +2128,7 @@ static int rtl8139_poll(struct napi_struct *napi, int budget)
 		 */
 		spin_lock_irqsave(&tp->lock, flags);
 		RTL_W16_F(IntrMask, rtl8139_intr_mask);
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 		spin_unlock_irqrestore(&tp->lock, flags);
 	}
 	spin_unlock(&tp->rx_lock);
@@ -2178,9 +2178,9 @@ static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance)
 	/* Receive packets are processed by poll routine.
 	   If not running start it now. */
 	if (status & RxAckBits){
-		if (netif_rx_schedule_prep(dev, &tp->napi)) {
+		if (netif_rx_schedule_prep(&tp->napi)) {
 			RTL_W16_F (IntrMask, rtl8139_norx_intr_mask);
-			__netif_rx_schedule(dev, &tp->napi);
+			__netif_rx_schedule(&tp->napi);
 		}
 	}
 
diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c
index 0bc4f54d5db..187ac6eb6e9 100644
--- a/drivers/net/amd8111e.c
+++ b/drivers/net/amd8111e.c
@@ -831,7 +831,7 @@ static int amd8111e_rx_poll(struct napi_struct *napi, int budget)
 	if (rx_pkt_limit > 0) {
 		/* Receive descriptor is empty now */
 		spin_lock_irqsave(&lp->lock, flags);
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 		writel(VAL0|RINTEN0, mmio + INTEN0);
 		writel(VAL2 | RDMD0, mmio + CMD0);
 		spin_unlock_irqrestore(&lp->lock, flags);
@@ -1170,11 +1170,11 @@ static irqreturn_t amd8111e_interrupt(int irq, void *dev_id)
 
 	/* Check if Receive Interrupt has occurred. */
 	if (intr0 & RINT0) {
-		if (netif_rx_schedule_prep(dev, &lp->napi)) {
+		if (netif_rx_schedule_prep(&lp->napi)) {
 			/* Disable receive interupts */
 			writel(RINTEN0, mmio + INTEN0);
 			/* Schedule a polling routine */
-			__netif_rx_schedule(dev, &lp->napi);
+			__netif_rx_schedule(&lp->napi);
 		} else if (intren0 & RINTEN0) {
 			printk("************Driver bug! \
 				interrupt while in poll\n");
diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 588c9739d13..6ecc600c1bc 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -298,7 +298,7 @@ poll_some_more:
 		int more = 0;
 
 		spin_lock_irq(&ep->rx_lock);
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 		wrl(ep, REG_INTEN, REG_INTEN_TX | REG_INTEN_RX);
 		if (ep93xx_have_more_rx(ep)) {
 			wrl(ep, REG_INTEN, REG_INTEN_TX);
@@ -415,9 +415,9 @@ static irqreturn_t ep93xx_irq(int irq, void *dev_id)
 
 	if (status & REG_INTSTS_RX) {
 		spin_lock(&ep->rx_lock);
-		if (likely(netif_rx_schedule_prep(dev, &ep->napi))) {
+		if (likely(netif_rx_schedule_prep(&ep->napi))) {
 			wrl(ep, REG_INTEN, REG_INTEN_TX);
-			__netif_rx_schedule(dev, &ep->napi);
+			__netif_rx_schedule(&ep->napi);
 		}
 		spin_unlock(&ep->rx_lock);
 	}
diff --git a/drivers/net/arm/ixp4xx_eth.c b/drivers/net/arm/ixp4xx_eth.c
index 14ffa2a6189..b03609f2e90 100644
--- a/drivers/net/arm/ixp4xx_eth.c
+++ b/drivers/net/arm/ixp4xx_eth.c
@@ -498,7 +498,7 @@ static void eth_rx_irq(void *pdev)
 	printk(KERN_DEBUG "%s: eth_rx_irq\n", dev->name);
 #endif
 	qmgr_disable_irq(port->plat->rxq);
-	netif_rx_schedule(dev, &port->napi);
+	netif_rx_schedule(&port->napi);
 }
 
 static int eth_poll(struct napi_struct *napi, int budget)
@@ -526,7 +526,7 @@ static int eth_poll(struct napi_struct *napi, int budget)
 			printk(KERN_DEBUG "%s: eth_poll netif_rx_complete\n",
 			       dev->name);
 #endif
-			netif_rx_complete(dev, napi);
+			netif_rx_complete(napi);
 			qmgr_enable_irq(rxq);
 			if (!qmgr_stat_empty(rxq) &&
 			    netif_rx_reschedule(dev, napi)) {
@@ -1025,7 +1025,7 @@ static int eth_open(struct net_device *dev)
 	}
 	ports_open++;
 	/* we may already have RX data, enables IRQ */
-	netif_rx_schedule(dev, &port->napi);
+	netif_rx_schedule(&port->napi);
 	return 0;
 }
 
diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c
index 98b2a7a466b..a72a46145ed 100644
--- a/drivers/net/atl1e/atl1e_main.c
+++ b/drivers/net/atl1e/atl1e_main.c
@@ -1326,9 +1326,9 @@ static irqreturn_t atl1e_intr(int irq, void *data)
 			AT_WRITE_REG(hw, REG_IMR,
 				     IMR_NORMAL_MASK & ~ISR_RX_EVENT);
 			AT_WRITE_FLUSH(hw);
-			if (likely(netif_rx_schedule_prep(netdev,
+			if (likely(netif_rx_schedule_prep(
 				   &adapter->napi)))
-				__netif_rx_schedule(netdev, &adapter->napi);
+				__netif_rx_schedule(&adapter->napi);
 		}
 	} while (--max_ints > 0);
 	/* re-enable Interrupt*/
@@ -1515,7 +1515,7 @@ static int atl1e_clean(struct napi_struct *napi, int budget)
 	/* If no Tx and not enough Rx work done, exit the polling mode */
 	if (work_done < budget) {
 quit_polling:
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		imr_data = AT_READ_REG(&adapter->hw, REG_IMR);
 		AT_WRITE_REG(&adapter->hw, REG_IMR, imr_data | ISR_RX_EVENT);
 		/* test debug */
diff --git a/drivers/net/b44.c b/drivers/net/b44.c
index 2c7a32eb92a..934a95091dc 100644
--- a/drivers/net/b44.c
+++ b/drivers/net/b44.c
@@ -875,7 +875,7 @@ static int b44_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (work_done < budget) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		b44_enable_ints(bp);
 	}
 
@@ -907,13 +907,13 @@ static irqreturn_t b44_interrupt(int irq, void *dev_id)
 			goto irq_ack;
 		}
 
-		if (netif_rx_schedule_prep(dev, &bp->napi)) {
+		if (netif_rx_schedule_prep(&bp->napi)) {
 			/* NOTE: These writes are posted by the readback of
 			 *       the ISTAT register below.
 			 */
 			bp->istat = istat;
 			__b44_disable_ints(bp);
-			__netif_rx_schedule(dev, &bp->napi);
+			__netif_rx_schedule(&bp->napi);
 		} else {
 			printk(KERN_ERR PFX "%s: Error, poll already scheduled\n",
 			       dev->name);
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 1a2780374a4..33d69ddc90a 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -3043,7 +3043,6 @@ bnx2_msi(int irq, void *dev_instance)
 {
 	struct bnx2_napi *bnapi = dev_instance;
 	struct bnx2 *bp = bnapi->bp;
-	struct net_device *dev = bp->dev;
 
 	prefetch(bnapi->status_blk.msi);
 	REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD,
@@ -3054,7 +3053,7 @@ bnx2_msi(int irq, void *dev_instance)
 	if (unlikely(atomic_read(&bp->intr_sem) != 0))
 		return IRQ_HANDLED;
 
-	netif_rx_schedule(dev, &bnapi->napi);
+	netif_rx_schedule(&bnapi->napi);
 
 	return IRQ_HANDLED;
 }
@@ -3064,7 +3063,6 @@ bnx2_msi_1shot(int irq, void *dev_instance)
 {
 	struct bnx2_napi *bnapi = dev_instance;
 	struct bnx2 *bp = bnapi->bp;
-	struct net_device *dev = bp->dev;
 
 	prefetch(bnapi->status_blk.msi);
 
@@ -3072,7 +3070,7 @@ bnx2_msi_1shot(int irq, void *dev_instance)
 	if (unlikely(atomic_read(&bp->intr_sem) != 0))
 		return IRQ_HANDLED;
 
-	netif_rx_schedule(dev, &bnapi->napi);
+	netif_rx_schedule(&bnapi->napi);
 
 	return IRQ_HANDLED;
 }
@@ -3082,7 +3080,6 @@ bnx2_interrupt(int irq, void *dev_instance)
 {
 	struct bnx2_napi *bnapi = dev_instance;
 	struct bnx2 *bp = bnapi->bp;
-	struct net_device *dev = bp->dev;
 	struct status_block *sblk = bnapi->status_blk.msi;
 
 	/* When using INTx, it is possible for the interrupt to arrive
@@ -3109,9 +3106,9 @@ bnx2_interrupt(int irq, void *dev_instance)
 	if (unlikely(atomic_read(&bp->intr_sem) != 0))
 		return IRQ_HANDLED;
 
-	if (netif_rx_schedule_prep(dev, &bnapi->napi)) {
+	if (netif_rx_schedule_prep(&bnapi->napi)) {
 		bnapi->last_status_idx = sblk->status_idx;
-		__netif_rx_schedule(dev, &bnapi->napi);
+		__netif_rx_schedule(&bnapi->napi);
 	}
 
 	return IRQ_HANDLED;
@@ -3221,7 +3218,7 @@ static int bnx2_poll_msix(struct napi_struct *napi, int budget)
 		rmb();
 		if (likely(!bnx2_has_fast_work(bnapi))) {
 
-			netif_rx_complete(bp->dev, napi);
+			netif_rx_complete(napi);
 			REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD, bnapi->int_num |
 			       BNX2_PCICFG_INT_ACK_CMD_INDEX_VALID |
 			       bnapi->last_status_idx);
@@ -3254,7 +3251,7 @@ static int bnx2_poll(struct napi_struct *napi, int budget)
 
 		rmb();
 		if (likely(!bnx2_has_work(bnapi))) {
-			netif_rx_complete(bp->dev, napi);
+			netif_rx_complete(napi);
 			if (likely(bp->flags & BNX2_FLAG_USING_MSI_OR_MSIX)) {
 				REG_WR(bp, BNX2_PCICFG_INT_ACK_CMD,
 				       BNX2_PCICFG_INT_ACK_CMD_INDEX_VALID |
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 24d2ae8b74b..02ab9b0ea69 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1615,7 +1615,7 @@ static irqreturn_t bnx2x_msix_fp_int(int irq, void *fp_cookie)
 	prefetch(&fp->status_blk->c_status_block.status_block_index);
 	prefetch(&fp->status_blk->u_status_block.status_block_index);
 
-	netif_rx_schedule(dev, &bnx2x_fp(bp, index, napi));
+	netif_rx_schedule(&bnx2x_fp(bp, index, napi));
 
 	return IRQ_HANDLED;
 }
@@ -1654,7 +1654,7 @@ static irqreturn_t bnx2x_interrupt(int irq, void *dev_instance)
 		prefetch(&fp->status_blk->c_status_block.status_block_index);
 		prefetch(&fp->status_blk->u_status_block.status_block_index);
 
-		netif_rx_schedule(dev, &bnx2x_fp(bp, 0, napi));
+		netif_rx_schedule(&bnx2x_fp(bp, 0, napi));
 
 		status &= ~mask;
 	}
@@ -9284,7 +9284,7 @@ static int bnx2x_poll(struct napi_struct *napi, int budget)
 #ifdef BNX2X_STOP_ON_ERROR
 poll_panic:
 #endif
-		netif_rx_complete(bp->dev, napi);
+		netif_rx_complete(napi);
 
 		bnx2x_ack_sb(bp, FP_SB_ID(fp), USTORM_ID,
 			     le16_to_cpu(fp->fp_u_idx), IGU_INT_NOP, 1);
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 023d205e905..321f43d9f0e 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -2506,7 +2506,7 @@ static irqreturn_t cas_interruptN(int irq, void *dev_id)
 	if (status & INTR_RX_DONE_ALT) { /* handle rx separately */
 #ifdef USE_NAPI
 		cas_mask_intr(cp);
-		netif_rx_schedule(dev, &cp->napi);
+		netif_rx_schedule(&cp->napi);
 #else
 		cas_rx_ringN(cp, ring, 0);
 #endif
@@ -2557,7 +2557,7 @@ static irqreturn_t cas_interrupt1(int irq, void *dev_id)
 	if (status & INTR_RX_DONE_ALT) { /* handle rx separately */
 #ifdef USE_NAPI
 		cas_mask_intr(cp);
-		netif_rx_schedule(dev, &cp->napi);
+		netif_rx_schedule(&cp->napi);
 #else
 		cas_rx_ringN(cp, 1, 0);
 #endif
@@ -2613,7 +2613,7 @@ static irqreturn_t cas_interrupt(int irq, void *dev_id)
 	if (status & INTR_RX_DONE) {
 #ifdef USE_NAPI
 		cas_mask_intr(cp);
-		netif_rx_schedule(dev, &cp->napi);
+		netif_rx_schedule(&cp->napi);
 #else
 		cas_rx_ringN(cp, 0, 0);
 #endif
@@ -2691,7 +2691,7 @@ rx_comp:
 #endif
 	spin_unlock_irqrestore(&cp->lock, flags);
 	if (enable_intr) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		cas_unmask_intr(cp);
 	}
 	return credits;
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
index 1da70070c2f..7896468dda1 100644
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -1613,7 +1613,7 @@ int t1_poll(struct napi_struct *napi, int budget)
 	int work_done = process_responses(adapter, budget);
 
 	if (likely(work_done < budget)) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		writel(adapter->sge->respQ.cidx,
 		       adapter->regs + A_SG_SLEEPING);
 	}
@@ -1633,7 +1633,7 @@ irqreturn_t t1_interrupt(int irq, void *data)
 
 		if (napi_schedule_prep(&adapter->napi)) {
 			if (process_pure_responses(adapter))
-				__netif_rx_schedule(dev, &adapter->napi);
+				__netif_rx_schedule(&adapter->napi);
 			else {
 				/* no data, no NAPI needed */
 				writel(sge->respQ.cidx, adapter->regs + A_SG_SLEEPING);
diff --git a/drivers/net/cpmac.c b/drivers/net/cpmac.c
index d39a77cba1a..f66548751c3 100644
--- a/drivers/net/cpmac.c
+++ b/drivers/net/cpmac.c
@@ -428,7 +428,7 @@ static int cpmac_poll(struct napi_struct *napi, int budget)
 			printk(KERN_WARNING "%s: rx: polling, but no queue\n",
 			       priv->dev->name);
 		spin_unlock(&priv->rx_lock);
-		netif_rx_complete(priv->dev, napi);
+		netif_rx_complete(napi);
 		return 0;
 	}
 
@@ -514,7 +514,7 @@ static int cpmac_poll(struct napi_struct *napi, int budget)
 	if (processed == 0) {
 		/* we ran out of packets to read,
 		 * revert to interrupt-driven mode */
-		netif_rx_complete(priv->dev, napi);
+		netif_rx_complete(napi);
 		cpmac_write(priv->regs, CPMAC_RX_INT_ENABLE, 1);
 		return 0;
 	}
@@ -536,7 +536,7 @@ fatal_error:
 	}
 
 	spin_unlock(&priv->rx_lock);
-	netif_rx_complete(priv->dev, napi);
+	netif_rx_complete(napi);
 	netif_tx_stop_all_queues(priv->dev);
 	napi_disable(&priv->napi);
 
@@ -802,9 +802,9 @@ static irqreturn_t cpmac_irq(int irq, void *dev_id)
 
 	if (status & MAC_INT_RX) {
 		queue = (status >> 8) & 7;
-		if (netif_rx_schedule_prep(dev, &priv->napi)) {
+		if (netif_rx_schedule_prep(&priv->napi)) {
 			cpmac_write(priv->regs, CPMAC_RX_INT_CLEAR, 1 << queue);
-			__netif_rx_schedule(dev, &priv->napi);
+			__netif_rx_schedule(&priv->napi);
 		}
 	}
 
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index dce7ff28c3f..9f38b16ccbb 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -2049,9 +2049,9 @@ static irqreturn_t e100_intr(int irq, void *dev_id)
 	if(stat_ack & stat_ack_rnr)
 		nic->ru_running = RU_SUSPENDED;
 
-	if(likely(netif_rx_schedule_prep(netdev, &nic->napi))) {
+	if(likely(netif_rx_schedule_prep(&nic->napi))) {
 		e100_disable_irq(nic);
-		__netif_rx_schedule(netdev, &nic->napi);
+		__netif_rx_schedule(&nic->napi);
 	}
 
 	return IRQ_HANDLED;
@@ -2060,7 +2060,6 @@ static irqreturn_t e100_intr(int irq, void *dev_id)
 static int e100_poll(struct napi_struct *napi, int budget)
 {
 	struct nic *nic = container_of(napi, struct nic, napi);
-	struct net_device *netdev = nic->netdev;
 	unsigned int work_done = 0;
 
 	e100_rx_clean(nic, &work_done, budget);
@@ -2068,7 +2067,7 @@ static int e100_poll(struct napi_struct *napi, int budget)
 
 	/* If budget not fully consumed, exit the polling mode */
 	if (work_done < budget) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		e100_enable_irq(nic);
 	}
 
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 116c96e0b11..26474c92193 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -3687,12 +3687,12 @@ static irqreturn_t e1000_intr_msi(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	if (likely(netif_rx_schedule_prep(netdev, &adapter->napi))) {
+	if (likely(netif_rx_schedule_prep(&adapter->napi))) {
 		adapter->total_tx_bytes = 0;
 		adapter->total_tx_packets = 0;
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	} else
 		e1000_irq_enable(adapter);
 
@@ -3747,12 +3747,12 @@ static irqreturn_t e1000_intr(int irq, void *data)
 		ew32(IMC, ~0);
 		E1000_WRITE_FLUSH();
 	}
-	if (likely(netif_rx_schedule_prep(netdev, &adapter->napi))) {
+	if (likely(netif_rx_schedule_prep(&adapter->napi))) {
 		adapter->total_tx_bytes = 0;
 		adapter->total_tx_packets = 0;
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	} else
 		/* this really should not happen! if it does it is basically a
 		 * bug, but not a hard error, so enable ints and continue */
@@ -3793,7 +3793,7 @@ static int e1000_clean(struct napi_struct *napi, int budget)
 	if (work_done < budget) {
 		if (likely(adapter->itr_setting & 3))
 			e1000_set_itr(adapter);
-		netif_rx_complete(poll_dev, napi);
+		netif_rx_complete(napi);
 		e1000_irq_enable(adapter);
 	}
 
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index f7b05609073..d4639facd1b 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -1179,12 +1179,12 @@ static irqreturn_t e1000_intr_msi(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
+	if (netif_rx_schedule_prep(&adapter->napi)) {
 		adapter->total_tx_bytes = 0;
 		adapter->total_tx_packets = 0;
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	}
 
 	return IRQ_HANDLED;
@@ -1246,12 +1246,12 @@ static irqreturn_t e1000_intr(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
+	if (netif_rx_schedule_prep(&adapter->napi)) {
 		adapter->total_tx_bytes = 0;
 		adapter->total_tx_packets = 0;
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	}
 
 	return IRQ_HANDLED;
@@ -1320,10 +1320,10 @@ static irqreturn_t e1000_intr_msix_rx(int irq, void *data)
 		adapter->rx_ring->set_itr = 0;
 	}
 
-	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
+	if (netif_rx_schedule_prep(&adapter->napi)) {
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	}
 	return IRQ_HANDLED;
 }
@@ -2028,7 +2028,7 @@ clean_rx:
 	if (work_done < budget) {
 		if (adapter->itr_setting & 3)
 			e1000_set_itr(adapter);
-		netif_rx_complete(poll_dev, napi);
+		netif_rx_complete(napi);
 		if (adapter->msix_entries)
 			ew32(IMS, adapter->rx_ring->ims_val);
 		else
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 44c9ae18383..035aa7dfc5c 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -830,7 +830,7 @@ static int ehea_poll(struct napi_struct *napi, int budget)
 	while ((rx != budget) || force_irq) {
 		pr->poll_counter = 0;
 		force_irq = 0;
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		ehea_reset_cq_ep(pr->recv_cq);
 		ehea_reset_cq_ep(pr->send_cq);
 		ehea_reset_cq_n1(pr->recv_cq);
@@ -859,7 +859,7 @@ static void ehea_netpoll(struct net_device *dev)
 	int i;
 
 	for (i = 0; i < port->num_def_qps; i++)
-		netif_rx_schedule(dev, &port->port_res[i].napi);
+		netif_rx_schedule(&port->port_res[i].napi);
 }
 #endif
 
@@ -867,7 +867,7 @@ static irqreturn_t ehea_recv_irq_handler(int irq, void *param)
 {
 	struct ehea_port_res *pr = param;
 
-	netif_rx_schedule(pr->port->netdev, &pr->napi);
+	netif_rx_schedule(&pr->napi);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index deddd76a550..d039e16f276 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -411,8 +411,8 @@ static irqreturn_t enic_isr_legacy(int irq, void *data)
 	}
 
 	if (ENIC_TEST_INTR(pba, ENIC_INTX_WQ_RQ)) {
-		if (netif_rx_schedule_prep(netdev, &enic->napi))
-			__netif_rx_schedule(netdev, &enic->napi);
+		if (netif_rx_schedule_prep(&enic->napi))
+			__netif_rx_schedule(&enic->napi);
 	} else {
 		vnic_intr_unmask(&enic->intr[ENIC_INTX_WQ_RQ]);
 	}
@@ -440,7 +440,7 @@ static irqreturn_t enic_isr_msi(int irq, void *data)
 	 * writes).
 	 */
 
-	netif_rx_schedule(enic->netdev, &enic->napi);
+	netif_rx_schedule(&enic->napi);
 
 	return IRQ_HANDLED;
 }
@@ -450,7 +450,7 @@ static irqreturn_t enic_isr_msix_rq(int irq, void *data)
 	struct enic *enic = data;
 
 	/* schedule NAPI polling for RQ cleanup */
-	netif_rx_schedule(enic->netdev, &enic->napi);
+	netif_rx_schedule(&enic->napi);
 
 	return IRQ_HANDLED;
 }
@@ -1068,7 +1068,7 @@ static int enic_poll(struct napi_struct *napi, int budget)
 		if (netdev->features & NETIF_F_LRO)
 			lro_flush_all(&enic->lro_mgr);
 
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		vnic_intr_unmask(&enic->intr[ENIC_MSIX_RQ]);
 	}
 
@@ -1112,7 +1112,7 @@ static int enic_poll_msix(struct napi_struct *napi, int budget)
 		if (netdev->features & NETIF_F_LRO)
 			lro_flush_all(&enic->lro_mgr);
 
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		vnic_intr_unmask(&enic->intr[ENIC_MSIX_RQ]);
 	}
 
diff --git a/drivers/net/epic100.c b/drivers/net/epic100.c
index 4a951b8cb4d..f9b37c80dda 100644
--- a/drivers/net/epic100.c
+++ b/drivers/net/epic100.c
@@ -1109,9 +1109,9 @@ static irqreturn_t epic_interrupt(int irq, void *dev_instance)
 
 	if ((status & EpicNapiEvent) && !ep->reschedule_in_poll) {
 		spin_lock(&ep->napi_lock);
-		if (netif_rx_schedule_prep(dev, &ep->napi)) {
+		if (netif_rx_schedule_prep(&ep->napi)) {
 			epic_napi_irq_off(dev, ep);
-			__netif_rx_schedule(dev, &ep->napi);
+			__netif_rx_schedule(&ep->napi);
 		} else
 			ep->reschedule_in_poll++;
 		spin_unlock(&ep->napi_lock);
@@ -1288,7 +1288,7 @@ rx_action:
 
 		more = ep->reschedule_in_poll;
 		if (!more) {
-			__netif_rx_complete(dev, napi);
+			__netif_rx_complete(napi);
 			outl(EpicNapiEvent, ioaddr + INTSTAT);
 			epic_napi_irq_on(dev, ep);
 		} else
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 1f2b24743ee..9fbfa856ae5 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -1760,7 +1760,7 @@ static void nv_do_rx_refill(unsigned long data)
 	struct fe_priv *np = netdev_priv(dev);
 
 	/* Just reschedule NAPI rx processing */
-	netif_rx_schedule(dev, &np->napi);
+	netif_rx_schedule(&np->napi);
 }
 #else
 static void nv_do_rx_refill(unsigned long data)
@@ -3403,7 +3403,7 @@ static irqreturn_t nv_nic_irq(int foo, void *data)
 
 #ifdef CONFIG_FORCEDETH_NAPI
 		if (events & NVREG_IRQ_RX_ALL) {
-			netif_rx_schedule(dev, &np->napi);
+			netif_rx_schedule(&np->napi);
 
 			/* Disable furthur receive irq's */
 			spin_lock(&np->lock);
@@ -3520,7 +3520,7 @@ static irqreturn_t nv_nic_irq_optimized(int foo, void *data)
 
 #ifdef CONFIG_FORCEDETH_NAPI
 		if (events & NVREG_IRQ_RX_ALL) {
-			netif_rx_schedule(dev, &np->napi);
+			netif_rx_schedule(&np->napi);
 
 			/* Disable furthur receive irq's */
 			spin_lock(&np->lock);
@@ -3678,7 +3678,7 @@ static int nv_napi_poll(struct napi_struct *napi, int budget)
 		/* re-enable receive interrupts */
 		spin_lock_irqsave(&np->lock, flags);
 
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 
 		np->irqmask |= NVREG_IRQ_RX_ALL;
 		if (np->msi_flags & NV_MSI_X_ENABLED)
@@ -3704,7 +3704,7 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data)
 	writel(NVREG_IRQ_RX_ALL, base + NvRegMSIXIrqStatus);
 
 	if (events) {
-		netif_rx_schedule(dev, &np->napi);
+		netif_rx_schedule(&np->napi);
 		/* disable receive interrupts on the nic */
 		writel(NVREG_IRQ_RX_ALL, base + NvRegIrqMask);
 		pci_push(base);
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index df66d620b11..4e6a9195fe5 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -209,7 +209,7 @@ static int fs_enet_rx_napi(struct napi_struct *napi, int budget)
 
 	if (received < budget) {
 		/* done */
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		(*fep->ops->napi_enable_rx)(dev);
 	}
 	return received;
@@ -478,7 +478,7 @@ fs_enet_interrupt(int irq, void *dev_id)
 				/* NOTE: it is possible for FCCs in NAPI mode    */
 				/* to submit a spurious interrupt while in poll  */
 				if (napi_ok)
-					__netif_rx_schedule(dev, &fep->napi);
+					__netif_rx_schedule(&fep->napi);
 			}
 		}
 
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index 13f49643ba0..c672ecfc959 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -1607,9 +1607,9 @@ static int gfar_clean_tx_ring(struct net_device *dev)
 static void gfar_schedule_cleanup(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
-	if (netif_rx_schedule_prep(dev, &priv->napi)) {
+	if (netif_rx_schedule_prep(&priv->napi)) {
 		gfar_write(&priv->regs->imask, IMASK_RTX_DISABLED);
-		__netif_rx_schedule(dev, &priv->napi);
+		__netif_rx_schedule(&priv->napi);
 	}
 }
 
@@ -1863,7 +1863,7 @@ static int gfar_poll(struct napi_struct *napi, int budget)
 		return budget;
 
 	if (rx_cleaned < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 		/* Clear the halt bit in RSTAT */
 		gfar_write(&priv->regs->rstat, RSTAT_CLEAR_RHALT);
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 02ecfdb4df6..1f055a95508 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -1028,7 +1028,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget)
 
 		ibmveth_assert(lpar_rc == H_SUCCESS);
 
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 
 		if (ibmveth_rxq_pending_buffer(adapter) &&
 		    netif_rx_reschedule(netdev, napi)) {
@@ -1047,11 +1047,11 @@ static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance)
 	struct ibmveth_adapter *adapter = netdev_priv(netdev);
 	unsigned long lpar_rc;
 
-	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
+	if (netif_rx_schedule_prep(&adapter->napi)) {
 		lpar_rc = h_vio_signal(adapter->vdev->unit_address,
 				       VIO_IRQ_DISABLE);
 		ibmveth_assert(lpar_rc == H_SUCCESS);
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	}
 	return IRQ_HANDLED;
 }
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 25df7c93106..6a40d9486da 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -3347,8 +3347,8 @@ static irqreturn_t igb_msix_rx(int irq, void *data)
 
 	igb_write_itr(rx_ring);
 
-	if (netif_rx_schedule_prep(adapter->netdev, &rx_ring->napi))
-		__netif_rx_schedule(adapter->netdev, &rx_ring->napi);
+	if (netif_rx_schedule_prep(&rx_ring->napi))
+		__netif_rx_schedule(&rx_ring->napi);
 
 #ifdef CONFIG_IGB_DCA
 	if (adapter->flags & IGB_FLAG_DCA_ENABLED)
@@ -3500,7 +3500,7 @@ static irqreturn_t igb_intr_msi(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	netif_rx_schedule(netdev, &adapter->rx_ring[0].napi);
+	netif_rx_schedule(&adapter->rx_ring[0].napi);
 
 	return IRQ_HANDLED;
 }
@@ -3538,7 +3538,7 @@ static irqreturn_t igb_intr(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	netif_rx_schedule(netdev, &adapter->rx_ring[0].napi);
+	netif_rx_schedule(&adapter->rx_ring[0].napi);
 
 	return IRQ_HANDLED;
 }
@@ -3573,7 +3573,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
 	    !netif_running(netdev)) {
 		if (adapter->itr_setting & 3)
 			igb_set_itr(adapter);
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		if (!test_bit(__IGB_DOWN, &adapter->state))
 			igb_irq_enable(adapter);
 		return 0;
@@ -3599,7 +3599,7 @@ static int igb_clean_rx_ring_msix(struct napi_struct *napi, int budget)
 
 	/* If not enough Rx work done, exit the polling mode */
 	if ((work_done == 0) || !netif_running(netdev)) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 
 		if (adapter->itr_setting & 3) {
 			if (adapter->num_rx_queues == 1)
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
index 820a92cc7f6..679125b3bbc 100644
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1721,14 +1721,14 @@ ixgb_intr(int irq, void *data)
 		if (!test_bit(__IXGB_DOWN, &adapter->flags))
 			mod_timer(&adapter->watchdog_timer, jiffies);
 
-	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
+	if (netif_rx_schedule_prep(&adapter->napi)) {
 
 		/* Disable interrupts and register for poll. The flush
 		  of the posted write is intentionally left out.
 		*/
 
 		IXGB_WRITE_REG(&adapter->hw, IMC, ~0);
-		__netif_rx_schedule(netdev, &adapter->napi);
+		__netif_rx_schedule(&adapter->napi);
 	}
 	return IRQ_HANDLED;
 }
@@ -1750,7 +1750,7 @@ ixgb_clean(struct napi_struct *napi, int budget)
 
 	/* If budget not fully consumed, exit the polling mode */
 	if (work_done < budget) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		if (!test_bit(__IXGB_DOWN, &adapter->flags))
 			ixgb_irq_enable(adapter);
 	}
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 92b35cfc7a4..b6ae9f674ba 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1012,7 +1012,7 @@ static irqreturn_t ixgbe_msix_clean_rx(int irq, void *data)
 	rx_ring = &(adapter->rx_ring[r_idx]);
 	/* disable interrupts on this vector only */
 	IXGBE_WRITE_REG(&adapter->hw, IXGBE_EIMC, rx_ring->v_idx);
-	netif_rx_schedule(adapter->netdev, &q_vector->napi);
+	netif_rx_schedule(&q_vector->napi);
 
 	return IRQ_HANDLED;
 }
@@ -1053,7 +1053,7 @@ static int ixgbe_clean_rxonly(struct napi_struct *napi, int budget)
 
 	/* If all Rx work done, exit the polling mode */
 	if (work_done < budget) {
-		netif_rx_complete(adapter->netdev, napi);
+		netif_rx_complete(napi);
 		if (adapter->itr_setting & 3)
 			ixgbe_set_itr_msix(q_vector);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
@@ -1102,7 +1102,7 @@ static int ixgbe_clean_rxonly_many(struct napi_struct *napi, int budget)
 	rx_ring = &(adapter->rx_ring[r_idx]);
 	/* If all Rx work done, exit the polling mode */
 	if (work_done < budget) {
-		netif_rx_complete(adapter->netdev, napi);
+		netif_rx_complete(napi);
 		if (adapter->itr_setting & 3)
 			ixgbe_set_itr_msix(q_vector);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
@@ -1378,13 +1378,13 @@ static irqreturn_t ixgbe_intr(int irq, void *data)
 
 	ixgbe_check_fan_failure(adapter, eicr);
 
-	if (netif_rx_schedule_prep(netdev, &adapter->q_vector[0].napi)) {
+	if (netif_rx_schedule_prep(&adapter->q_vector[0].napi)) {
 		adapter->tx_ring[0].total_packets = 0;
 		adapter->tx_ring[0].total_bytes = 0;
 		adapter->rx_ring[0].total_packets = 0;
 		adapter->rx_ring[0].total_bytes = 0;
 		/* would disable interrupts here but EIAM disabled it */
-		__netif_rx_schedule(netdev, &adapter->q_vector[0].napi);
+		__netif_rx_schedule(&adapter->q_vector[0].napi);
 	}
 
 	return IRQ_HANDLED;
@@ -2308,7 +2308,7 @@ static int ixgbe_poll(struct napi_struct *napi, int budget)
 
 	/* If budget not fully consumed, exit the polling mode */
 	if (work_done < budget) {
-		netif_rx_complete(adapter->netdev, napi);
+		netif_rx_complete(napi);
 		if (adapter->itr_setting & 3)
 			ixgbe_set_itr(adapter);
 		if (!test_bit(__IXGBE_DOWN, &adapter->state))
diff --git a/drivers/net/ixp2000/ixpdev.c b/drivers/net/ixp2000/ixpdev.c
index bd96dbc8e02..01474572056 100644
--- a/drivers/net/ixp2000/ixpdev.c
+++ b/drivers/net/ixp2000/ixpdev.c
@@ -141,7 +141,7 @@ static int ixpdev_poll(struct napi_struct *napi, int budget)
 			break;
 	} while (ixp2000_reg_read(IXP2000_IRQ_THD_RAW_STATUS_A_0) & 0x00ff);
 
-	netif_rx_complete(dev, napi);
+	netif_rx_complete(napi);
 	ixp2000_reg_write(IXP2000_IRQ_THD_ENABLE_SET_A_0, 0x00ff);
 
 	return rx;
@@ -204,7 +204,7 @@ static irqreturn_t ixpdev_interrupt(int irq, void *dev_id)
 
 		ixp2000_reg_wrb(IXP2000_IRQ_THD_ENABLE_CLEAR_A_0, 0x00ff);
 		if (likely(napi_schedule_prep(&ip->napi))) {
-			__netif_rx_schedule(dev, &ip->napi);
+			__netif_rx_schedule(&ip->napi);
 		} else {
 			printk(KERN_CRIT "ixp2000: irq while polling!!\n");
 		}
diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 15035cb1738..08b34051c64 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -1250,7 +1250,6 @@ static int
 jme_poll(JME_NAPI_HOLDER(holder), JME_NAPI_WEIGHT(budget))
 {
 	struct jme_adapter *jme = jme_napi_priv(holder);
-	struct net_device *netdev = jme->dev;
 	int rest;
 
 	rest = jme_process_receive(jme, JME_NAPI_WEIGHT_VAL(budget));
diff --git a/drivers/net/jme.h b/drivers/net/jme.h
index adaf3ddbf78..2d6f30e638f 100644
--- a/drivers/net/jme.h
+++ b/drivers/net/jme.h
@@ -398,15 +398,15 @@ struct jme_ring {
 #define JME_NAPI_WEIGHT(w) int w
 #define JME_NAPI_WEIGHT_VAL(w) w
 #define JME_NAPI_WEIGHT_SET(w, r)
-#define JME_RX_COMPLETE(dev, napis) netif_rx_complete(dev, napis)
+#define JME_RX_COMPLETE(dev, napis) netif_rx_complete(napis)
 #define JME_NAPI_ENABLE(priv) napi_enable(&priv->napi);
 #define JME_NAPI_DISABLE(priv) \
 	if (!napi_disable_pending(&priv->napi)) \
 		napi_disable(&priv->napi);
 #define JME_RX_SCHEDULE_PREP(priv) \
-	netif_rx_schedule_prep(priv->dev, &priv->napi)
+	netif_rx_schedule_prep(&priv->napi)
 #define JME_RX_SCHEDULE(priv) \
-	__netif_rx_schedule(priv->dev, &priv->napi);
+	__netif_rx_schedule(&priv->napi);
 
 /*
  * Jmac Adapter Private data
diff --git a/drivers/net/korina.c b/drivers/net/korina.c
index 63626953f07..4a5580c1126 100644
--- a/drivers/net/korina.c
+++ b/drivers/net/korina.c
@@ -327,7 +327,7 @@ static irqreturn_t korina_rx_dma_interrupt(int irq, void *dev_id)
 
 	dmas = readl(&lp->rx_dma_regs->dmas);
 	if (dmas & (DMA_STAT_DONE | DMA_STAT_HALT | DMA_STAT_ERR)) {
-		netif_rx_schedule_prep(dev, &lp->napi);
+		netif_rx_schedule_prep(&lp->napi);
 
 		dmasm = readl(&lp->rx_dma_regs->dmasm);
 		writel(dmasm | (DMA_STAT_DONE |
@@ -466,7 +466,7 @@ static int korina_poll(struct napi_struct *napi, int budget)
 
 	work_done = korina_rx(dev, budget);
 	if (work_done < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 		writel(readl(&lp->rx_dma_regs->dmasm) &
 			~(DMA_STAT_DONE | DMA_STAT_HALT | DMA_STAT_ERR),
diff --git a/drivers/net/macb.c b/drivers/net/macb.c
index 261b9507124..a04da4ecaa8 100644
--- a/drivers/net/macb.c
+++ b/drivers/net/macb.c
@@ -519,7 +519,7 @@ static int macb_poll(struct napi_struct *napi, int budget)
 		 * this function was called last time, and no packets
 		 * have been received since.
 		 */
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		goto out;
 	}
 
@@ -530,13 +530,13 @@ static int macb_poll(struct napi_struct *napi, int budget)
 		dev_warn(&bp->pdev->dev,
 			 "No RX buffers complete, status = %02lx\n",
 			 (unsigned long)status);
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		goto out;
 	}
 
 	work_done = macb_rx(bp, budget);
 	if (work_done < budget)
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 	/*
 	 * We've done what we can to clean the buffers. Make sure we
@@ -571,7 +571,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 		}
 
 		if (status & MACB_RX_INT_FLAGS) {
-			if (netif_rx_schedule_prep(dev, &bp->napi)) {
+			if (netif_rx_schedule_prep(&bp->napi)) {
 				/*
 				 * There's no point taking any more interrupts
 				 * until we have processed the buffers
@@ -579,7 +579,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 				macb_writel(bp, IDR, MACB_RX_INT_FLAGS);
 				dev_dbg(&bp->pdev->dev,
 					"scheduling RX softirq\n");
-				__netif_rx_schedule(dev, &bp->napi);
+				__netif_rx_schedule(&bp->napi);
 			}
 		}
 
diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c
index ffe28089b68..c61b0bdca1a 100644
--- a/drivers/net/mlx4/en_rx.c
+++ b/drivers/net/mlx4/en_rx.c
@@ -814,7 +814,7 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq)
 	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
 
 	if (priv->port_up)
-		netif_rx_schedule(cq->dev, &cq->napi);
+		netif_rx_schedule(&cq->napi);
 	else
 		mlx4_en_arm_cq(priv, cq);
 }
@@ -834,7 +834,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
 		INC_PERF_COUNTER(priv->pstats.napi_quota);
 	else {
 		/* Done for now */
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		mlx4_en_arm_cq(priv, cq);
 	}
 	return done;
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index f017c774e1a..378c89e6c7d 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -1515,7 +1515,7 @@ static int myri10ge_poll(struct napi_struct *napi, int budget)
 	work_done = myri10ge_clean_rx_done(ss, budget);
 
 	if (work_done < budget) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		put_be32(htonl(3), ss->irq_claim);
 	}
 	return work_done;
@@ -1533,7 +1533,7 @@ static irqreturn_t myri10ge_intr(int irq, void *arg)
 	/* an interrupt on a non-zero receive-only slice is implicitly
 	 * valid  since MSI-X irqs are not shared */
 	if ((mgp->dev->real_num_tx_queues == 1) && (ss != mgp->ss)) {
-		netif_rx_schedule(ss->dev, &ss->napi);
+		netif_rx_schedule(&ss->napi);
 		return (IRQ_HANDLED);
 	}
 
@@ -1544,7 +1544,7 @@ static irqreturn_t myri10ge_intr(int irq, void *arg)
 	/* low bit indicates receives are present, so schedule
 	 * napi poll handler */
 	if (stats->valid & 1)
-		netif_rx_schedule(ss->dev, &ss->napi);
+		netif_rx_schedule(&ss->napi);
 
 	if (!mgp->msi_enabled && !mgp->msix_enabled) {
 		put_be32(0, mgp->irq_deassert);
diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c
index 9f81fcb9688..478edb92bca 100644
--- a/drivers/net/natsemi.c
+++ b/drivers/net/natsemi.c
@@ -2193,10 +2193,10 @@ static irqreturn_t intr_handler(int irq, void *dev_instance)
 
 	prefetch(&np->rx_skbuff[np->cur_rx % RX_RING_SIZE]);
 
-	if (netif_rx_schedule_prep(dev, &np->napi)) {
+	if (netif_rx_schedule_prep(&np->napi)) {
 		/* Disable interrupts and register for poll */
 		natsemi_irq_disable(dev);
-		__netif_rx_schedule(dev, &np->napi);
+		__netif_rx_schedule(&np->napi);
 	} else
 		printk(KERN_WARNING
 	       	       "%s: Ignoring interrupt, status %#08x, mask %#08x.\n",
@@ -2248,7 +2248,7 @@ static int natsemi_poll(struct napi_struct *napi, int budget)
 		np->intr_status = readl(ioaddr + IntrStatus);
 	} while (np->intr_status);
 
-	netif_rx_complete(dev, napi);
+	netif_rx_complete(napi);
 
 	/* Reenable interrupts providing nothing is trying to shut
 	 * the chip down. */
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 6876bfd4455..ba01524b553 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -1583,7 +1583,7 @@ static int netxen_nic_poll(struct napi_struct *napi, int budget)
 	}
 
 	if ((work_done < budget) && tx_complete) {
-		netif_rx_complete(adapter->netdev, &adapter->napi);
+		netif_rx_complete(&adapter->napi);
 		netxen_nic_enable_int(adapter);
 	}
 
diff --git a/drivers/net/niu.c b/drivers/net/niu.c
index f219f16ec97..5698c155bbf 100644
--- a/drivers/net/niu.c
+++ b/drivers/net/niu.c
@@ -3669,7 +3669,7 @@ static int niu_poll(struct napi_struct *napi, int budget)
 	work_done = niu_poll_core(np, lp, budget);
 
 	if (work_done < budget) {
-		netif_rx_complete(np->dev, napi);
+		netif_rx_complete(napi);
 		niu_ldg_rearm(np, lp, 1);
 	}
 	return work_done;
@@ -4088,12 +4088,12 @@ static void __niu_fastpath_interrupt(struct niu *np, int ldg, u64 v0)
 static void niu_schedule_napi(struct niu *np, struct niu_ldg *lp,
 			      u64 v0, u64 v1, u64 v2)
 {
-	if (likely(netif_rx_schedule_prep(np->dev, &lp->napi))) {
+	if (likely(netif_rx_schedule_prep(&lp->napi))) {
 		lp->v0 = v0;
 		lp->v1 = v1;
 		lp->v2 = v2;
 		__niu_fastpath_interrupt(np, lp->ldg_num, v0);
-		__netif_rx_schedule(np->dev, &lp->napi);
+		__netif_rx_schedule(&lp->napi);
 	}
 }
 
diff --git a/drivers/net/pasemi_mac.c b/drivers/net/pasemi_mac.c
index fcbf6ccd0a8..dcd19904561 100644
--- a/drivers/net/pasemi_mac.c
+++ b/drivers/net/pasemi_mac.c
@@ -971,7 +971,7 @@ static irqreturn_t pasemi_mac_rx_intr(int irq, void *data)
 	if (*chan->status & PAS_STATUS_ERROR)
 		reg |= PAS_IOB_DMA_RXCH_RESET_DINTC;
 
-	netif_rx_schedule(dev, &mac->napi);
+	netif_rx_schedule(&mac->napi);
 
 	write_iob_reg(PAS_IOB_DMA_RXCH_RESET(chan->chno), reg);
 
@@ -1011,7 +1011,7 @@ static irqreturn_t pasemi_mac_tx_intr(int irq, void *data)
 
 	mod_timer(&txring->clean_timer, jiffies + (TX_CLEAN_INTERVAL)*2);
 
-	netif_rx_schedule(mac->netdev, &mac->napi);
+	netif_rx_schedule(&mac->napi);
 
 	if (reg)
 		write_iob_reg(PAS_IOB_DMA_TXCH_RESET(chan->chno), reg);
@@ -1641,7 +1641,7 @@ static int pasemi_mac_poll(struct napi_struct *napi, int budget)
 	pkts = pasemi_mac_clean_rx(rx_ring(mac), budget);
 	if (pkts < budget) {
 		/* all done, no more packets present */
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 		pasemi_mac_restart_rx_intr(mac);
 		pasemi_mac_restart_tx_intr(mac);
diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c
index f2b192c80e1..044b7b07f5f 100644
--- a/drivers/net/pcnet32.c
+++ b/drivers/net/pcnet32.c
@@ -1397,7 +1397,7 @@ static int pcnet32_poll(struct napi_struct *napi, int budget)
 	if (work_done < budget) {
 		spin_lock_irqsave(&lp->lock, flags);
 
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 
 		/* clear interrupt masks */
 		val = lp->a.read_csr(ioaddr, CSR3);
@@ -2586,14 +2586,14 @@ pcnet32_interrupt(int irq, void *dev_id)
 				       dev->name, csr0);
 			/* unlike for the lance, there is no restart needed */
 		}
-		if (netif_rx_schedule_prep(dev, &lp->napi)) {
+		if (netif_rx_schedule_prep(&lp->napi)) {
 			u16 val;
 			/* set interrupt masks */
 			val = lp->a.read_csr(ioaddr, CSR3);
 			val |= 0x5f00;
 			lp->a.write_csr(ioaddr, CSR3, val);
 			mmiowb();
-			__netif_rx_schedule(dev, &lp->napi);
+			__netif_rx_schedule(&lp->napi);
 			break;
 		}
 		csr0 = lp->a.read_csr(ioaddr, CSR0);
diff --git a/drivers/net/qla3xxx.c b/drivers/net/qla3xxx.c
index 6b7ed1a5b3b..33e8e62b450 100644
--- a/drivers/net/qla3xxx.c
+++ b/drivers/net/qla3xxx.c
@@ -2293,7 +2293,7 @@ static int ql_poll(struct napi_struct *napi, int budget)
 
 	if (tx_cleaned + rx_cleaned != budget) {
 		spin_lock_irqsave(&qdev->hw_lock, hw_flags);
-		__netif_rx_complete(ndev, napi);
+		__netif_rx_complete(napi);
 		ql_update_small_bufq_prod_index(qdev);
 		ql_update_lrg_bufq_prod_index(qdev);
 		writel(qdev->rsp_consumer_index,
@@ -2352,8 +2352,8 @@ static irqreturn_t ql3xxx_isr(int irq, void *dev_id)
 		spin_unlock(&qdev->adapter_lock);
 	} else if (value & ISP_IMR_DISABLE_CMPL_INT) {
 		ql_disable_interrupts(qdev);
-		if (likely(netif_rx_schedule_prep(ndev, &qdev->napi))) {
-			__netif_rx_schedule(ndev, &qdev->napi);
+		if (likely(netif_rx_schedule_prep(&qdev->napi))) {
+			__netif_rx_schedule(&qdev->napi);
 		}
 	} else {
 		return IRQ_NONE;
diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c
index 225930fda5a..02147082786 100644
--- a/drivers/net/qlge/qlge_main.c
+++ b/drivers/net/qlge/qlge_main.c
@@ -1647,7 +1647,7 @@ static int ql_napi_poll_msix(struct napi_struct *napi, int budget)
 		rx_ring->cq_id);
 
 	if (work_done < budget) {
-		__netif_rx_complete(qdev->ndev, napi);
+		__netif_rx_complete(napi);
 		ql_enable_completion_interrupt(qdev, rx_ring->irq);
 	}
 	return work_done;
@@ -1733,7 +1733,7 @@ static irqreturn_t qlge_msix_rx_isr(int irq, void *dev_id)
 {
 	struct rx_ring *rx_ring = dev_id;
 	struct ql_adapter *qdev = rx_ring->qdev;
-	netif_rx_schedule(qdev->ndev, &rx_ring->napi);
+	netif_rx_schedule(&rx_ring->napi);
 	return IRQ_HANDLED;
 }
 
@@ -1819,8 +1819,7 @@ static irqreturn_t qlge_isr(int irq, void *dev_id)
 							      &rx_ring->rx_work,
 							      0);
 				else
-					netif_rx_schedule(qdev->ndev,
-							  &rx_ring->napi);
+					netif_rx_schedule(&rx_ring->napi);
 				work_done++;
 			}
 		}
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index aff1cc627c0..53bbddfc8c9 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -667,7 +667,7 @@ static int r6040_poll(struct napi_struct *napi, int budget)
 	work_done = r6040_rx(dev, budget);
 
 	if (work_done < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		/* Enable RX interrupt */
 		iowrite16(ioread16(ioaddr + MIER) | RX_INTS, ioaddr + MIER);
 	}
@@ -704,7 +704,7 @@ static irqreturn_t r6040_interrupt(int irq, void *dev_id)
 
 		/* Mask off RX interrupt */
 		misr &= ~RX_INTS;
-		netif_rx_schedule(dev, &lp->napi);
+		netif_rx_schedule(&lp->napi);
 	}
 
 	/* TX interrupt request */
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index dddf6aeff49..2c73ca606b3 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3581,8 +3581,8 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 		RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event);
 		tp->intr_mask = ~tp->napi_event;
 
-		if (likely(netif_rx_schedule_prep(dev, &tp->napi)))
-			__netif_rx_schedule(dev, &tp->napi);
+		if (likely(netif_rx_schedule_prep(&tp->napi)))
+			__netif_rx_schedule(&tp->napi);
 		else if (netif_msg_intr(tp)) {
 			printk(KERN_INFO "%s: interrupt %04x in poll\n",
 			       dev->name, status);
@@ -3603,7 +3603,7 @@ static int rtl8169_poll(struct napi_struct *napi, int budget)
 	rtl8169_tx_interrupt(dev, tp, ioaddr);
 
 	if (work_done < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		tp->intr_mask = 0xffff;
 		/*
 		 * 20040426: the barrier is not strictly required but the
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index 1b489df80fa..512861923c6 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -2852,7 +2852,7 @@ static int s2io_poll_msix(struct napi_struct *napi, int budget)
 	s2io_chk_rx_buffers(nic, ring);
 
 	if (pkts_processed < budget_org) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		/*Re Enable MSI-Rx Vector*/
 		addr = (u8 __iomem *)&bar0->xmsi_mask_reg;
 		addr += 7 - ring->ring_no;
@@ -2890,7 +2890,7 @@ static int s2io_poll_inta(struct napi_struct *napi, int budget)
 			break;
 	}
 	if (pkts_processed < budget_org) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		/* Re enable the Rx interrupts for the ring */
 		writeq(0, &bar0->rx_traffic_mask);
 		readl(&bar0->rx_traffic_mask);
@@ -4344,7 +4344,7 @@ static irqreturn_t s2io_msix_ring_handle(int irq, void *dev_id)
 		val8 = (ring->ring_no == 0) ? 0x7f : 0xff;
 		writeb(val8, addr);
 		val8 = readb(addr);
-		netif_rx_schedule(dev, &ring->napi);
+		netif_rx_schedule(&ring->napi);
 	} else {
 		rx_intr_handler(ring, 0);
 		s2io_chk_rx_buffers(sp, ring);
@@ -4791,7 +4791,7 @@ static irqreturn_t s2io_isr(int irq, void *dev_id)
 
 		if (config->napi) {
 			if (reason & GEN_INTR_RXTRAFFIC) {
-				netif_rx_schedule(dev, &sp->napi);
+				netif_rx_schedule(&sp->napi);
 				writeq(S2IO_MINUS_ONE, &bar0->rx_traffic_mask);
 				writeq(S2IO_MINUS_ONE, &bar0->rx_traffic_int);
 				readl(&bar0->rx_traffic_int);
diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c
index 480caec1e02..31e38fae017 100644
--- a/drivers/net/sb1250-mac.c
+++ b/drivers/net/sb1250-mac.c
@@ -2039,9 +2039,9 @@ static irqreturn_t sbmac_intr(int irq,void *dev_instance)
 		sbdma_tx_process(sc,&(sc->sbm_txdma), 0);
 
 	if (isr & (M_MAC_INT_CHANNEL << S_MAC_RX_CH0)) {
-		if (netif_rx_schedule_prep(dev, &sc->napi)) {
+		if (netif_rx_schedule_prep(&sc->napi)) {
 			__raw_writeq(0, sc->sbm_imr);
-			__netif_rx_schedule(dev, &sc->napi);
+			__netif_rx_schedule(&sc->napi);
 			/* Depend on the exit from poll to reenable intr */
 		}
 		else {
@@ -2667,7 +2667,7 @@ static int sbmac_poll(struct napi_struct *napi, int budget)
 	sbdma_tx_process(sc, &(sc->sbm_txdma), 1);
 
 	if (work_done < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 #ifdef CONFIG_SBMAC_COALESCE
 		__raw_writeq(((M_MAC_INT_EOP_COUNT | M_MAC_INT_EOP_TIMER) << S_MAC_TX_CH0) |
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 086629c0fe5..42934ba2030 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -230,7 +230,7 @@ static int efx_poll(struct napi_struct *napi, int budget)
 		 * since efx_channel_processed() will have no effect if
 		 * interrupts have already been disabled.
 		 */
-		netif_rx_complete(napi_dev, napi);
+		netif_rx_complete(napi);
 		efx_channel_processed(channel);
 	}
 
diff --git a/drivers/net/sfc/efx.h b/drivers/net/sfc/efx.h
index dd0d45b9e71..0dd7a532c78 100644
--- a/drivers/net/sfc/efx.h
+++ b/drivers/net/sfc/efx.h
@@ -77,7 +77,7 @@ static inline void efx_schedule_channel(struct efx_channel *channel)
 		  channel->channel, raw_smp_processor_id());
 	channel->work_pending = true;
 
-	netif_rx_schedule(channel->napi_dev, &channel->napi_str);
+	netif_rx_schedule(&channel->napi_str);
 }
 
 #endif /* EFX_EFX_H */
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index f73ee797400..c9dbb06f8c9 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -3214,7 +3214,7 @@ static int skge_poll(struct napi_struct *napi, int to_do)
 		unsigned long flags;
 
 		spin_lock_irqsave(&hw->hw_lock, flags);
-		__netif_rx_complete(dev, napi);
+		__netif_rx_complete(napi);
 		hw->intr_mask |= napimask[skge->port];
 		skge_write32(hw, B0_IMSK, hw->intr_mask);
 		skge_read32(hw, B0_IMSK);
@@ -3377,7 +3377,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id)
 	if (status & (IS_XA1_F|IS_R1_F)) {
 		struct skge_port *skge = netdev_priv(hw->dev[0]);
 		hw->intr_mask &= ~(IS_XA1_F|IS_R1_F);
-		netif_rx_schedule(hw->dev[0], &skge->napi);
+		netif_rx_schedule(&skge->napi);
 	}
 
 	if (status & IS_PA_TO_TX1)
@@ -3397,7 +3397,7 @@ static irqreturn_t skge_intr(int irq, void *dev_id)
 
 		if (status & (IS_XA2_F|IS_R2_F)) {
 			hw->intr_mask &= ~(IS_XA2_F|IS_R2_F);
-			netif_rx_schedule(hw->dev[1], &skge->napi);
+			netif_rx_schedule(&skge->napi);
 		}
 
 		if (status & IS_PA_TO_RX2) {
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index fa28542b47d..ecdde03d416 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -984,7 +984,7 @@ static int smsc911x_poll(struct napi_struct *napi, int budget)
 			/* We processed all packets available.  Tell NAPI it can
 			 * stop polling then re-enable rx interrupts */
 			smsc911x_reg_write(pdata, INT_STS, INT_STS_RSFL_);
-			netif_rx_complete(dev, napi);
+			netif_rx_complete(napi);
 			temp = smsc911x_reg_read(pdata, INT_EN);
 			temp |= INT_EN_RSFL_EN_;
 			smsc911x_reg_write(pdata, INT_EN, temp);
diff --git a/drivers/net/smsc9420.c b/drivers/net/smsc9420.c
index 940220f6092..27e017d9696 100644
--- a/drivers/net/smsc9420.c
+++ b/drivers/net/smsc9420.c
@@ -666,7 +666,7 @@ static irqreturn_t smsc9420_isr(int irq, void *dev_id)
 			smsc9420_pci_flush_write(pd);
 
 			ints_to_clear |= (DMAC_STS_RX_ | DMAC_STS_NIS_);
-			netif_rx_schedule(pd->dev, &pd->napi);
+			netif_rx_schedule(&pd->napi);
 		}
 
 		if (ints_to_clear)
@@ -889,7 +889,7 @@ static int smsc9420_rx_poll(struct napi_struct *napi, int budget)
 	smsc9420_pci_flush_write(pd);
 
 	if (work_done < budget) {
-		netif_rx_complete(dev, &pd->napi);
+		netif_rx_complete(&pd->napi);
 
 		/* re-enable RX DMA interrupts */
 		dma_intr_ena = smsc9420_reg_read(pd, DMAC_INTR_ENA);
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 325fbc9612c..c5c123d3af5 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -1302,7 +1302,7 @@ static int spider_net_poll(struct napi_struct *napi, int budget)
 	/* if all packets are in the stack, enable interrupts and return 0 */
 	/* if not, return 1 */
 	if (packets_done < budget) {
-		netif_rx_complete(netdev, napi);
+		netif_rx_complete(napi);
 		spider_net_rx_irq_on(card);
 		card->ignore_rx_ramfull = 0;
 	}
@@ -1529,8 +1529,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg,
 			spider_net_refill_rx_chain(card);
 			spider_net_enable_rxdmac(card);
 			card->num_rx_ints ++;
-			netif_rx_schedule(card->netdev,
-					  &card->napi);
+			netif_rx_schedule(&card->napi);
 		}
 		show_error = 0;
 		break;
@@ -1550,8 +1549,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg,
 		spider_net_refill_rx_chain(card);
 		spider_net_enable_rxdmac(card);
 		card->num_rx_ints ++;
-		netif_rx_schedule(card->netdev,
-				  &card->napi);
+		netif_rx_schedule(&card->napi);
 		show_error = 0;
 		break;
 
@@ -1565,8 +1563,7 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg,
 		spider_net_refill_rx_chain(card);
 		spider_net_enable_rxdmac(card);
 		card->num_rx_ints ++;
-		netif_rx_schedule(card->netdev,
-				  &card->napi);
+		netif_rx_schedule(&card->napi);
 		show_error = 0;
 		break;
 
@@ -1660,11 +1657,11 @@ spider_net_interrupt(int irq, void *ptr)
 
 	if (status_reg & SPIDER_NET_RXINT ) {
 		spider_net_rx_irq_off(card);
-		netif_rx_schedule(netdev, &card->napi);
+		netif_rx_schedule(&card->napi);
 		card->num_rx_ints ++;
 	}
 	if (status_reg & SPIDER_NET_TXINT)
-		netif_rx_schedule(netdev, &card->napi);
+		netif_rx_schedule(&card->napi);
 
 	if (status_reg & SPIDER_NET_LINKINT)
 		spider_net_link_reset(netdev);
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c
index 0358809f409..d5b9dd842c6 100644
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -1290,8 +1290,8 @@ static irqreturn_t intr_handler(int irq, void *dev_instance)
 		if (intr_status & (IntrRxDone | IntrRxEmpty)) {
 			u32 enable;
 
-			if (likely(netif_rx_schedule_prep(dev, &np->napi))) {
-				__netif_rx_schedule(dev, &np->napi);
+			if (likely(netif_rx_schedule_prep(&np->napi))) {
+				__netif_rx_schedule(&np->napi);
 				enable = readl(ioaddr + IntrEnable);
 				enable &= ~(IntrRxDone | IntrRxEmpty);
 				writel(enable, ioaddr + IntrEnable);
@@ -1530,7 +1530,7 @@ static int netdev_poll(struct napi_struct *napi, int budget)
 		intr_status = readl(ioaddr + IntrStatus);
 	} while (intr_status & (IntrRxDone | IntrRxEmpty));
 
-	netif_rx_complete(dev, napi);
+	netif_rx_complete(napi);
 	intr_status = readl(ioaddr + IntrEnable);
 	intr_status |= IntrRxDone | IntrRxEmpty;
 	writel(intr_status, ioaddr + IntrEnable);
diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c
index f4b0beec4d1..8a746041248 100644
--- a/drivers/net/sungem.c
+++ b/drivers/net/sungem.c
@@ -921,7 +921,7 @@ static int gem_poll(struct napi_struct *napi, int budget)
 		gp->status = readl(gp->regs + GREG_STAT);
 	} while (gp->status & GREG_STAT_NAPI);
 
-	__netif_rx_complete(dev, napi);
+	__netif_rx_complete(napi);
 	gem_enable_ints(gp);
 
 	spin_unlock_irqrestore(&gp->lock, flags);
@@ -944,7 +944,7 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id)
 
 	spin_lock_irqsave(&gp->lock, flags);
 
-	if (netif_rx_schedule_prep(dev, &gp->napi)) {
+	if (netif_rx_schedule_prep(&gp->napi)) {
 		u32 gem_status = readl(gp->regs + GREG_STAT);
 
 		if (gem_status == 0) {
@@ -954,7 +954,7 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id)
 		}
 		gp->status = gem_status;
 		gem_disable_ints(gp);
-		__netif_rx_schedule(dev, &gp->napi);
+		__netif_rx_schedule(&gp->napi);
 	}
 
 	spin_unlock_irqrestore(&gp->lock, flags);
diff --git a/drivers/net/tc35815.c b/drivers/net/tc35815.c
index 308f365270e..bcd0e60cbda 100644
--- a/drivers/net/tc35815.c
+++ b/drivers/net/tc35815.c
@@ -1609,8 +1609,8 @@ static irqreturn_t tc35815_interrupt(int irq, void *dev_id)
 	if (!(dmactl & DMA_IntMask)) {
 		/* disable interrupts */
 		tc_writel(dmactl | DMA_IntMask, &tr->DMA_Ctl);
-		if (netif_rx_schedule_prep(dev, &lp->napi))
-			__netif_rx_schedule(dev, &lp->napi);
+		if (netif_rx_schedule_prep(&lp->napi))
+			__netif_rx_schedule(&lp->napi);
 		else {
 			printk(KERN_ERR "%s: interrupt taken in poll\n",
 			       dev->name);
@@ -1919,7 +1919,7 @@ static int tc35815_poll(struct napi_struct *napi, int budget)
 	spin_unlock(&lp->lock);
 
 	if (received < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		/* enable interrupts */
 		tc_writel(tc_readl(&tr->DMA_Ctl) & ~DMA_IntMask, &tr->DMA_Ctl);
 	}
diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c
index 5b83fbb0201..a10a83a11d9 100644
--- a/drivers/net/tehuti.c
+++ b/drivers/net/tehuti.c
@@ -265,8 +265,8 @@ static irqreturn_t bdx_isr_napi(int irq, void *dev)
 		bdx_isr_extra(priv, isr);
 
 	if (isr & (IR_RX_DESC_0 | IR_TX_FREE_0)) {
-		if (likely(netif_rx_schedule_prep(ndev, &priv->napi))) {
-			__netif_rx_schedule(ndev, &priv->napi);
+		if (likely(netif_rx_schedule_prep(&priv->napi))) {
+			__netif_rx_schedule(&priv->napi);
 			RET(IRQ_HANDLED);
 		} else {
 			/* NOTE: we get here if intr has slipped into window
@@ -289,7 +289,6 @@ static irqreturn_t bdx_isr_napi(int irq, void *dev)
 static int bdx_poll(struct napi_struct *napi, int budget)
 {
 	struct bdx_priv *priv = container_of(napi, struct bdx_priv, napi);
-	struct net_device *dev = priv->ndev;
 	int work_done;
 
 	ENTER;
@@ -303,7 +302,7 @@ static int bdx_poll(struct napi_struct *napi, int budget)
 		 * device lock and allow waiting tasks (eg rmmod) to advance) */
 		priv->napi_stop = 0;
 
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		bdx_enable_interrupts(priv);
 	}
 	return work_done;
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 7971d802508..04ae1e86aea 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -4451,7 +4451,7 @@ static int tg3_poll(struct napi_struct *napi, int budget)
 			sblk->status &= ~SD_STATUS_UPDATED;
 
 		if (likely(!tg3_has_work(tp))) {
-			netif_rx_complete(tp->dev, napi);
+			netif_rx_complete(napi);
 			tg3_restart_ints(tp);
 			break;
 		}
@@ -4461,7 +4461,7 @@ static int tg3_poll(struct napi_struct *napi, int budget)
 
 tx_recovery:
 	/* work_done is guaranteed to be less than budget. */
-	netif_rx_complete(tp->dev, napi);
+	netif_rx_complete(napi);
 	schedule_work(&tp->reset_task);
 	return work_done;
 }
@@ -4510,7 +4510,7 @@ static irqreturn_t tg3_msi_1shot(int irq, void *dev_id)
 	prefetch(&tp->rx_rcb[tp->rx_rcb_ptr]);
 
 	if (likely(!tg3_irq_sync(tp)))
-		netif_rx_schedule(dev, &tp->napi);
+		netif_rx_schedule(&tp->napi);
 
 	return IRQ_HANDLED;
 }
@@ -4535,7 +4535,7 @@ static irqreturn_t tg3_msi(int irq, void *dev_id)
 	 */
 	tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001);
 	if (likely(!tg3_irq_sync(tp)))
-		netif_rx_schedule(dev, &tp->napi);
+		netif_rx_schedule(&tp->napi);
 
 	return IRQ_RETVAL(1);
 }
@@ -4577,7 +4577,7 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id)
 	sblk->status &= ~SD_STATUS_UPDATED;
 	if (likely(tg3_has_work(tp))) {
 		prefetch(&tp->rx_rcb[tp->rx_rcb_ptr]);
-		netif_rx_schedule(dev, &tp->napi);
+		netif_rx_schedule(&tp->napi);
 	} else {
 		/* No work, shared interrupt perhaps?  re-enable
 		 * interrupts, and flush that PCI write
@@ -4623,7 +4623,7 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id)
 	tw32_mailbox_f(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001);
 	if (tg3_irq_sync(tp))
 		goto out;
-	if (netif_rx_schedule_prep(dev, &tp->napi)) {
+	if (netif_rx_schedule_prep(&tp->napi)) {
 		prefetch(&tp->rx_rcb[tp->rx_rcb_ptr]);
 		/* Update last_tag to mark that this status has been
 		 * seen. Because interrupt may be shared, we may be
@@ -4631,7 +4631,7 @@ static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id)
 		 * if tg3_poll() is not scheduled.
 		 */
 		tp->last_tag = sblk->status_tag;
-		__netif_rx_schedule(dev, &tp->napi);
+		__netif_rx_schedule(&tp->napi);
 	}
 out:
 	return IRQ_RETVAL(handled);
diff --git a/drivers/net/tsi108_eth.c b/drivers/net/tsi108_eth.c
index 271bc230c8a..75461dbd487 100644
--- a/drivers/net/tsi108_eth.c
+++ b/drivers/net/tsi108_eth.c
@@ -888,7 +888,7 @@ static int tsi108_poll(struct napi_struct *napi, int budget)
 
 	if (num_received < budget) {
 		data->rxpending = 0;
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 		TSI_WRITE(TSI108_EC_INTMASK,
 				     TSI_READ(TSI108_EC_INTMASK)
@@ -919,7 +919,7 @@ static void tsi108_rx_int(struct net_device *dev)
 	 * from tsi108_check_rxring().
 	 */
 
-	if (netif_rx_schedule_prep(dev, &data->napi)) {
+	if (netif_rx_schedule_prep(&data->napi)) {
 		/* Mask, rather than ack, the receive interrupts.  The ack
 		 * will happen in tsi108_poll().
 		 */
@@ -930,7 +930,7 @@ static void tsi108_rx_int(struct net_device *dev)
 				     | TSI108_INT_RXTHRESH |
 				     TSI108_INT_RXOVERRUN | TSI108_INT_RXERROR |
 				     TSI108_INT_RXWAIT);
-		__netif_rx_schedule(dev, &data->napi);
+		__netif_rx_schedule(&data->napi);
 	} else {
 		if (!netif_running(dev)) {
 			/* This can happen if an interrupt occurs while the
diff --git a/drivers/net/tulip/interrupt.c b/drivers/net/tulip/interrupt.c
index 739d610d18c..6c3428a37c0 100644
--- a/drivers/net/tulip/interrupt.c
+++ b/drivers/net/tulip/interrupt.c
@@ -103,7 +103,7 @@ void oom_timer(unsigned long data)
 {
         struct net_device *dev = (struct net_device *)data;
 	struct tulip_private *tp = netdev_priv(dev);
-	netif_rx_schedule(dev, &tp->napi);
+	netif_rx_schedule(&tp->napi);
 }
 
 int tulip_poll(struct napi_struct *napi, int budget)
@@ -300,7 +300,7 @@ int tulip_poll(struct napi_struct *napi, int budget)
 
          /* Remove us from polling list and enable RX intr. */
 
-         netif_rx_complete(dev, napi);
+         netif_rx_complete(napi);
          iowrite32(tulip_tbl[tp->chip_id].valid_intrs, tp->base_addr+CSR7);
 
          /* The last op happens after poll completion. Which means the following:
@@ -336,7 +336,7 @@ int tulip_poll(struct napi_struct *napi, int budget)
           * before we did netif_rx_complete(). See? We would lose it. */
 
          /* remove ourselves from the polling list */
-         netif_rx_complete(dev, napi);
+         netif_rx_complete(napi);
 
          return work_done;
 }
@@ -519,7 +519,7 @@ irqreturn_t tulip_interrupt(int irq, void *dev_instance)
 			rxd++;
 			/* Mask RX intrs and add the device to poll list. */
 			iowrite32(tulip_tbl[tp->chip_id].valid_intrs&~RxPollInt, ioaddr + CSR7);
-			netif_rx_schedule(dev, &tp->napi);
+			netif_rx_schedule(&tp->napi);
 
 			if (!(csr5&~(AbnormalIntr|NormalIntr|RxPollInt|TPLnkPass)))
                                break;
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index 5386d9b73e6..0009f4e3443 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -1755,7 +1755,6 @@ static int
 typhoon_poll(struct napi_struct *napi, int budget)
 {
 	struct typhoon *tp = container_of(napi, struct typhoon, napi);
-	struct net_device *dev = tp->dev;
 	struct typhoon_indexes *indexes = tp->indexes;
 	int work_done;
 
@@ -1784,7 +1783,7 @@ typhoon_poll(struct napi_struct *napi, int budget)
 	}
 
 	if (work_done < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		iowrite32(TYPHOON_INTR_NONE,
 				tp->ioaddr + TYPHOON_REG_INTR_MASK);
 		typhoon_post_pci_writes(tp->ioaddr);
@@ -1807,10 +1806,10 @@ typhoon_interrupt(int irq, void *dev_instance)
 
 	iowrite32(intr_status, ioaddr + TYPHOON_REG_INTR_STATUS);
 
-	if (netif_rx_schedule_prep(dev, &tp->napi)) {
+	if (netif_rx_schedule_prep(&tp->napi)) {
 		iowrite32(TYPHOON_INTR_ALL, ioaddr + TYPHOON_REG_INTR_MASK);
 		typhoon_post_pci_writes(ioaddr);
-		__netif_rx_schedule(dev, &tp->napi);
+		__netif_rx_schedule(&tp->napi);
 	} else {
 		printk(KERN_ERR "%s: Error, poll already scheduled\n",
                        dev->name);
diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
index 5c82f147f15..78a2ede19c5 100644
--- a/drivers/net/ucc_geth.c
+++ b/drivers/net/ucc_geth.c
@@ -3330,7 +3330,7 @@ static int ucc_geth_poll(struct napi_struct *napi, int budget)
 		struct ucc_fast_private *uccf;
 		u32 uccm;
 
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 		uccf = ugeth->uccf;
 		uccm = in_be32(uccf->p_uccm);
 		uccm |= UCCE_RX_EVENTS;
@@ -3364,10 +3364,10 @@ static irqreturn_t ucc_geth_irq_handler(int irq, void *info)
 
 	/* check for receive events that require processing */
 	if (ucce & UCCE_RX_EVENTS) {
-		if (netif_rx_schedule_prep(dev, &ugeth->napi)) {
+		if (netif_rx_schedule_prep(&ugeth->napi)) {
 			uccm &= ~UCCE_RX_EVENTS;
 			out_be32(uccf->p_uccm, uccm);
-			__netif_rx_schedule(dev, &ugeth->napi);
+			__netif_rx_schedule(&ugeth->napi);
 		}
 	}
 
diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c
index 8d405c83df8..ac07cc6e3cb 100644
--- a/drivers/net/via-rhine.c
+++ b/drivers/net/via-rhine.c
@@ -589,7 +589,7 @@ static int rhine_napipoll(struct napi_struct *napi, int budget)
 	work_done = rhine_rx(dev, budget);
 
 	if (work_done < budget) {
-		netif_rx_complete(dev, napi);
+		netif_rx_complete(napi);
 
 		iowrite16(IntrRxDone | IntrRxErr | IntrRxEmpty| IntrRxOverflow |
 			  IntrRxDropped | IntrRxNoBuf | IntrTxAborted |
@@ -1318,7 +1318,7 @@ static irqreturn_t rhine_interrupt(int irq, void *dev_instance)
 				  IntrPCIErr | IntrStatsMax | IntrLinkChange,
 				  ioaddr + IntrEnable);
 
-			netif_rx_schedule(dev, &rp->napi);
+			netif_rx_schedule(&rp->napi);
 		}
 
 		if (intr_status & (IntrTxErrSummary | IntrTxDone)) {
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 71ca29cc184..b7004ff3645 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -374,9 +374,9 @@ static void skb_recv_done(struct virtqueue *rvq)
 {
 	struct virtnet_info *vi = rvq->vdev->priv;
 	/* Schedule NAPI, Suppress further interrupts if successful. */
-	if (netif_rx_schedule_prep(vi->dev, &vi->napi)) {
+	if (netif_rx_schedule_prep(&vi->napi)) {
 		rvq->vq_ops->disable_cb(rvq);
-		__netif_rx_schedule(vi->dev, &vi->napi);
+		__netif_rx_schedule(&vi->napi);
 	}
 }
 
@@ -402,11 +402,11 @@ again:
 
 	/* Out of packets? */
 	if (received < budget) {
-		netif_rx_complete(vi->dev, napi);
+		netif_rx_complete(napi);
 		if (unlikely(!vi->rvq->vq_ops->enable_cb(vi->rvq))
 		    && napi_schedule_prep(napi)) {
 			vi->rvq->vq_ops->disable_cb(vi->rvq);
-			__netif_rx_schedule(vi->dev, napi);
+			__netif_rx_schedule(napi);
 			goto again;
 		}
 	}
@@ -580,9 +580,9 @@ static int virtnet_open(struct net_device *dev)
 	 * won't get another interrupt, so process any outstanding packets
 	 * now.  virtnet_poll wants re-enable the queue, so we disable here.
 	 * We synchronize against interrupts via NAPI_STATE_SCHED */
-	if (netif_rx_schedule_prep(dev, &vi->napi)) {
+	if (netif_rx_schedule_prep(&vi->napi)) {
 		vi->rvq->vq_ops->disable_cb(vi->rvq);
-		__netif_rx_schedule(dev, &vi->napi);
+		__netif_rx_schedule(&vi->napi);
 	}
 	return 0;
 }
diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c
index 0bcc0b5f22d..08b3536944f 100644
--- a/drivers/net/wan/hd64572.c
+++ b/drivers/net/wan/hd64572.c
@@ -341,7 +341,7 @@ static int sca_poll(struct napi_struct *napi, int budget)
 		received = sca_rx_done(port, budget);
 
 	if (received < budget) {
-		netif_rx_complete(port->netdev, napi);
+		netif_rx_complete(napi);
 		enable_intr(port);
 	}
 
@@ -359,7 +359,7 @@ static irqreturn_t sca_intr(int irq, void *dev_id)
 		if (port && (isr0 & (i ? 0x08002200 : 0x00080022))) {
 			handled = 1;
 			disable_intr(port);
-			netif_rx_schedule(port->netdev, &port->napi);
+			netif_rx_schedule(&port->napi);
 		}
 	}
 
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index fe376fde4e8..761635be910 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -196,7 +196,7 @@ static void rx_refill_timeout(unsigned long data)
 {
 	struct net_device *dev = (struct net_device *)data;
 	struct netfront_info *np = netdev_priv(dev);
-	netif_rx_schedule(dev, &np->napi);
+	netif_rx_schedule(&np->napi);
 }
 
 static int netfront_tx_slot_available(struct netfront_info *np)
@@ -328,7 +328,7 @@ static int xennet_open(struct net_device *dev)
 		xennet_alloc_rx_buffers(dev);
 		np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
-			netif_rx_schedule(dev, &np->napi);
+			netif_rx_schedule(&np->napi);
 	}
 	spin_unlock_bh(&np->rx_lock);
 
@@ -979,7 +979,7 @@ err:
 
 		RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
 		if (!more_to_do)
-			__netif_rx_complete(dev, napi);
+			__netif_rx_complete(napi);
 
 		local_irq_restore(flags);
 	}
@@ -1310,7 +1310,7 @@ static irqreturn_t xennet_interrupt(int irq, void *dev_id)
 		xennet_tx_buf_gc(dev);
 		/* Under tx_lock: protects access to rx shared-ring indexes. */
 		if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
-			netif_rx_schedule(dev, &np->napi);
+			netif_rx_schedule(&np->napi);
 	}
 
 	spin_unlock_irqrestore(&np->tx_lock, flags);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 58856b6737f..41e1224651c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1555,8 +1555,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
 }
 
 /* Test if receive needs to be scheduled but only if up */
-static inline int netif_rx_schedule_prep(struct net_device *dev,
-					 struct napi_struct *napi)
+static inline int netif_rx_schedule_prep(struct napi_struct *napi)
 {
 	return napi_schedule_prep(napi);
 }
@@ -1564,27 +1563,24 @@ static inline int netif_rx_schedule_prep(struct net_device *dev,
 /* Add interface to tail of rx poll list. This assumes that _prep has
  * already been called and returned 1.
  */
-static inline void __netif_rx_schedule(struct net_device *dev,
-				       struct napi_struct *napi)
+static inline void __netif_rx_schedule(struct napi_struct *napi)
 {
 	__napi_schedule(napi);
 }
 
 /* Try to reschedule poll. Called by irq handler. */
 
-static inline void netif_rx_schedule(struct net_device *dev,
-				     struct napi_struct *napi)
+static inline void netif_rx_schedule(struct napi_struct *napi)
 {
-	if (netif_rx_schedule_prep(dev, napi))
-		__netif_rx_schedule(dev, napi);
+	if (netif_rx_schedule_prep(napi))
+		__netif_rx_schedule(napi);
 }
 
 /* Try to reschedule poll. Called by dev->poll() after netif_rx_complete().  */
-static inline int netif_rx_reschedule(struct net_device *dev,
-				      struct napi_struct *napi)
+static inline int netif_rx_reschedule(struct napi_struct *napi)
 {
 	if (napi_schedule_prep(napi)) {
-		__netif_rx_schedule(dev, napi);
+		__netif_rx_schedule(napi);
 		return 1;
 	}
 	return 0;
@@ -1593,8 +1589,7 @@ static inline int netif_rx_reschedule(struct net_device *dev,
 /* same as netif_rx_complete, except that local_irq_save(flags)
  * has already been issued
  */
-static inline void __netif_rx_complete(struct net_device *dev,
-				       struct napi_struct *napi)
+static inline void __netif_rx_complete(struct napi_struct *napi)
 {
 	__napi_complete(napi);
 }
@@ -1604,8 +1599,7 @@ static inline void __netif_rx_complete(struct net_device *dev,
  * it completes the work. The device cannot be out of poll list at this
  * moment, it is BUG().
  */
-static inline void netif_rx_complete(struct net_device *dev,
-				     struct napi_struct *napi)
+static inline void netif_rx_complete(struct napi_struct *napi)
 {
 	napi_complete(napi);
 }
-- 
cgit v1.2.3-70-g09d2


From 88a9fe8cae3bb52e82489447f45e8d7ba1409ca8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 23 Dec 2008 15:21:31 -0500
Subject: SUNRPC: Remove the last remnant of the BKL...

Somehow, this escaped the previous purge. There should be no need to keep
any extra locks in the XDR callbacks.

The NFS client XDR code only writes into private objects, whereas all reads
of shared objects are confined to fields that do not change, such as
filehandles...

Ditto for lockd, the NFSv2/v3 client mount code, and rpcbind.

The nfsd XDR code may require the BKL, but since it does a synchronous RPC
call from a thread that already holds the lock, that issue is moot.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h     | 15 ---------------
 net/sunrpc/auth.c              |  4 ++--
 net/sunrpc/auth_gss/auth_gss.c | 10 +++++-----
 3 files changed, 7 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index e4057d729f0..49e1eb45446 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -36,21 +36,6 @@ struct xdr_netobj {
  */
 typedef int	(*kxdrproc_t)(void *rqstp, __be32 *data, void *obj);
 
-/*
- * We're still requiring the BKL in the xdr code until it's been
- * more carefully audited, at which point this wrapper will become
- * unnecessary.
- */
-static inline int rpc_call_xdrproc(kxdrproc_t xdrproc, void *rqstp, __be32 *data, void *obj)
-{
-	int ret;
-
-	lock_kernel();
-	ret = xdrproc(rqstp, data, obj);
-	unlock_kernel();
-	return ret;
-}
-
 /*
  * Basic structure for transmission/reception of a client XDR message.
  * Features a header (for a linear buffer containing RPC headers
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index cb216b2df66..6e28744b170 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -513,7 +513,7 @@ rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp,
 	if (cred->cr_ops->crwrap_req)
 		return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj);
 	/* By default, we encode the arguments normally. */
-	return rpc_call_xdrproc(encode, rqstp, data, obj);
+	return encode(rqstp, data, obj);
 }
 
 int
@@ -528,7 +528,7 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp,
 		return cred->cr_ops->crunwrap_resp(task, decode, rqstp,
 						   data, obj);
 	/* By default, we decode the arguments normally. */
-	return rpc_call_xdrproc(decode, rqstp, data, obj);
+	return decode(rqstp, data, obj);
 }
 
 int
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 853a4142cea..b8561597f0c 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1017,7 +1017,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	*p++ = htonl(rqstp->rq_seqno);
 
-	status = rpc_call_xdrproc(encode, rqstp, p, obj);
+	status = encode(rqstp, p, obj);
 	if (status)
 		return status;
 
@@ -1111,7 +1111,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
 	*p++ = htonl(rqstp->rq_seqno);
 
-	status = rpc_call_xdrproc(encode, rqstp, p, obj);
+	status = encode(rqstp, p, obj);
 	if (status)
 		return status;
 
@@ -1170,12 +1170,12 @@ gss_wrap_req(struct rpc_task *task,
 		/* The spec seems a little ambiguous here, but I think that not
 		 * wrapping context destruction requests makes the most sense.
 		 */
-		status = rpc_call_xdrproc(encode, rqstp, p, obj);
+		status = encode(rqstp, p, obj);
 		goto out;
 	}
 	switch (gss_cred->gc_service) {
 		case RPC_GSS_SVC_NONE:
-			status = rpc_call_xdrproc(encode, rqstp, p, obj);
+			status = encode(rqstp, p, obj);
 			break;
 		case RPC_GSS_SVC_INTEGRITY:
 			status = gss_wrap_req_integ(cred, ctx, encode,
@@ -1291,7 +1291,7 @@ gss_unwrap_resp(struct rpc_task *task,
 	cred->cr_auth->au_rslack = cred->cr_auth->au_verfsize + (p - savedp)
 						+ (savedlen - head->iov_len);
 out_decode:
-	status = rpc_call_xdrproc(decode, rqstp, p, obj);
+	status = decode(rqstp, p, obj);
 out:
 	gss_put_ctx(ctx);
 	dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid,
-- 
cgit v1.2.3-70-g09d2


From 146ec944bbd31d241a44a00518b054fb01921d22 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 23 Dec 2008 15:21:34 -0500
Subject: NFS: Move declaration of nfs_mount() to fs/nfs/internal.h

Clean up:  The nfs_mount() function is not to be used outside of the
NFS client.  Move its public declaration to fs/nfs/internal.h.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h      | 4 ++++
 fs/nfs/nfsroot.c       | 2 ++
 include/linux/nfs_fs.h | 6 ------
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d212ee41caf..7a38cc7b413 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,10 @@ struct nfs_parsed_mount_data {
 	struct security_mnt_opts lsm_opts;
 };
 
+/* mount_clnt.c */
+extern int nfs_mount(struct sockaddr *, size_t, char *, char *,
+					int, int, struct nfs_fh *);
+
 /* client.c */
 extern struct rpc_program nfs_program;
 
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index f1c0a066439..a96e5fdb38a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,6 +86,8 @@
 #include <net/ipconfig.h>
 #include <linux/parser.h>
 
+#include "internal.h"
+
 /* Define this to allow debugging output */
 #undef NFSROOT_DEBUG
 #define NFSDBG_FACILITY NFSDBG_ROOT
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4eaa8347a0d..f11077285a6 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -532,12 +532,6 @@ static inline void nfs3_forget_cached_acls(struct inode *inode)
 }
 #endif /* CONFIG_NFS_V3_ACL */
 
-/*
- * linux/fs/mount_clnt.c
- */
-extern int  nfs_mount(struct sockaddr *, size_t, char *, char *,
-		      int, int, struct nfs_fh *);
-
 /*
  * inline functions
  */
-- 
cgit v1.2.3-70-g09d2


From d740351bf0960e89ce1aef45cfe00167cb0f9e5b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 23 Dec 2008 15:21:37 -0500
Subject: NFS: add "[no]resvport" mount option

The standard default security setting for NFS is AUTH_SYS.  An NFS
client connects to NFS servers via a privileged source port and a
fixed standard destination port (2049).  The client sends raw uid and
gid numbers to identify users making NFS requests, and the server
assumes an appropriate authority on the client has vetted these
values because the source port is privileged.

On Linux, by default in-kernel RPC services use a privileged port in
the range between 650 and 1023 to avoid using source ports of well-
known IP services.  Using such a small range limits the number of NFS
mount points and the number of unique NFS servers to which a client
can connect concurrently.

An NFS client can use unprivileged source ports to expand the range of
source port numbers, allowing more concurrent server connections and
more NFS mount points.  Servers must explicitly allow NFS connections
from unprivileged ports for this to work.

In the past, bumping the value of the sunrpc.max_resvport sysctl on
the client would permit the NFS client to use unprivileged ports.
Bumping this setting also changes the maximum port number used by
other in-kernel RPC services, some of which still required a port
number less than 1023.

This is exacerbated by the way source port numbers are chosen by the
Linux RPC client, which starts at the top of the range and works
downwards.  It means that bumping the maximum means all RPC services
requesting a source port will likely get an unprivileged port instead
of a privileged one.

Changing this setting effects all NFS mount points on a client.  A
sysadmin could not selectively choose which mount points would use
non-privileged ports and which could not.

Lastly, this mechanism of expanding the limit on the number of NFS
mount points was entirely undocumented.

To address the need for the NFS client to use a large range of source
ports without interfering with the activity of other in-kernel RPC
services, we introduce a new NFS mount option.  This option explicitly
tells only the NFS client to use a non-privileged source port when
communicating with the NFS server for one specific mount point.

This new mount option is called "resvport," like the similar NFS mount
option on FreeBSD and Mac OS X.  A sister patch for nfs-utils will be
submitted that documents this new option in nfs(5).

The default setting for this new mount option requires the NFS client
to use a privileged port, as before.  Explicitly specifying the
"noresvport" mount option allows the NFS client to use an unprivileged
source port for this mount point when connecting to the NFS server
port.

This mount option is supported only for text-based NFS mounts.

[ Sidebar: it is widely known that security mechanisms based on the
  use of privileged source ports are ineffective.  However, the NFS
  client can combine the use of unprivileged ports with the use of
  secure authentication mechanisms, such as Kerberos.  This allows a
  large number of connections and mount points while ensuring a useful
  level of security.

  Eventually we may change the default setting for this option
  depending on the security flavor used for the mount.  For example,
  if the mount is using only AUTH_SYS, then the default setting will
  be "resvport;" if the mount is using a strong security flavor such
  as krb5, the default setting will be "noresvport." ]

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
[Trond.Myklebust@netapp.com: Fixed a bug whereby nfs4_init_client()
was being called with incorrect arguments.]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 11 +++++++----
 fs/nfs/super.c            | 12 +++++++++++-
 include/linux/nfs_mount.h |  3 ++-
 3 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 27190337fc1..3a69cacc4fa 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -627,7 +627,8 @@ static int nfs_init_client(struct nfs_client *clp,
 	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
 	 * - RFC 2623, sec 2.3.2
 	 */
-	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0, 0);
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
+				      0, data->flags & NFS_MOUNT_NORESVPORT);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -969,7 +970,8 @@ error:
 static int nfs4_init_client(struct nfs_client *clp,
 		const struct rpc_timeout *timeparms,
 		const char *ip_addr,
-		rpc_authflavor_t authflavour)
+		rpc_authflavor_t authflavour,
+		int flags)
 {
 	int error;
 
@@ -983,7 +985,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 	clp->rpc_ops = &nfs_v4_clientops;
 
 	error = nfs_create_rpc_client(clp, timeparms, authflavour,
-				      1, 0);
+				      1, flags & NFS_MOUNT_NORESVPORT);
 	if (error < 0)
 		goto error;
 	memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1034,7 +1036,8 @@ static int nfs4_set_client(struct nfs_server *server,
 		error = PTR_ERR(clp);
 		goto error;
 	}
-	error = nfs4_init_client(clp, timeparms, ip_addr, authflavour);
+	error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
+					server->flags);
 	if (error < 0)
 		goto error_put;
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2b0c8e132b5..e05a77be306 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -75,6 +75,7 @@ enum {
 	Opt_acl, Opt_noacl,
 	Opt_rdirplus, Opt_nordirplus,
 	Opt_sharecache, Opt_nosharecache,
+	Opt_resvport, Opt_noresvport,
 
 	/* Mount options that take integer arguments */
 	Opt_port,
@@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_nordirplus, "nordirplus" },
 	{ Opt_sharecache, "sharecache" },
 	{ Opt_nosharecache, "nosharecache" },
+	{ Opt_resvport, "resvport" },
+	{ Opt_noresvport, "noresvport" },
 
 	{ Opt_port, "port=%u" },
 	{ Opt_rsize, "rsize=%u" },
@@ -514,7 +517,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		{ NFS_MOUNT_NONLM, ",nolock", "" },
 		{ NFS_MOUNT_NOACL, ",noacl", "" },
 		{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
-		{ NFS_MOUNT_UNSHARED, ",nosharecache", ""},
+		{ NFS_MOUNT_UNSHARED, ",nosharecache", "" },
+		{ NFS_MOUNT_NORESVPORT, ",noresvport", "" },
 		{ 0, NULL, NULL }
 	};
 	const struct proc_nfs_info *nfs_infop;
@@ -1035,6 +1039,12 @@ static int nfs_parse_mount_options(char *raw,
 		case Opt_nosharecache:
 			mnt->flags |= NFS_MOUNT_UNSHARED;
 			break;
+		case Opt_resvport:
+			mnt->flags &= ~NFS_MOUNT_NORESVPORT;
+			break;
+		case Opt_noresvport:
+			mnt->flags |= NFS_MOUNT_NORESVPORT;
+			break;
 
 		/*
 		 * options that take numeric values
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index 6549a06ac16..4499016e6d0 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -45,7 +45,7 @@ struct nfs_mount_data {
 	char		context[NFS_MAX_CONTEXT_LEN + 1];	/* 6 */
 };
 
-/* bits in the flags field */
+/* bits in the flags field visible to user space */
 
 #define NFS_MOUNT_SOFT		0x0001	/* 1 */
 #define NFS_MOUNT_INTR		0x0002	/* 1 */ /* now unused, but ABI */
@@ -68,5 +68,6 @@ struct nfs_mount_data {
 /* The following are for internal use only */
 #define NFS_MOUNT_LOOKUP_CACHE_NONEG	0x10000
 #define NFS_MOUNT_LOOKUP_CACHE_NONE	0x20000
+#define NFS_MOUNT_NORESVPORT		0x40000
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 0cb2659b818eca99235e17c04291cfa9985c14f7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 23 Dec 2008 15:21:38 -0500
Subject: NLM: allow lockd requests from an unprivileged port

If the admin has specified the "noresvport" option for an NFS mount
point, the kernel's NFS client uses an unprivileged source port for
the main NFS transport.  The kernel's lockd client should use an
unprivileged port in this case as well.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c         |  2 +-
 fs/lockd/host.c             | 10 +++++++++-
 fs/nfs/client.c             |  2 ++
 include/linux/lockd/bind.h  |  1 +
 include/linux/lockd/lockd.h |  4 +++-
 5 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 94d42cc4e39..1f3b0fc0d35 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -61,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
 
 	host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
 				   nlm_init->protocol, nlm_version,
-				   nlm_init->hostname);
+				   nlm_init->hostname, nlm_init->noresvport);
 	if (host == NULL) {
 		lockd_down();
 		return ERR_PTR(-ENOLCK);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 70fc63a1727..acc2aa5021d 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -48,6 +48,7 @@ struct nlm_lookup_host_info {
 	const size_t		hostname_len;	/* it's length */
 	const struct sockaddr	*src_sap;	/* our address (optional) */
 	const size_t		src_len;	/* it's length */
+	const int		noresvport;	/* use non-priv port */
 };
 
 /*
@@ -222,6 +223,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_nsmhandle  = nsm;
 	host->h_server	   = ni->server;
+	host->h_noresvport = ni->noresvport;
 	hlist_add_head(&host->h_hash, chain);
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
@@ -272,6 +274,7 @@ nlm_destroy_host(struct nlm_host *host)
  * @protocol: transport protocol to use
  * @version: NLM protocol version
  * @hostname: '\0'-terminated hostname of server
+ * @noresvport: 1 if non-privileged port should be used
  *
  * Returns an nlm_host structure that matches the passed-in
  * [server address, transport protocol, NLM version, server hostname].
@@ -281,7 +284,9 @@ nlm_destroy_host(struct nlm_host *host)
 struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 				     const size_t salen,
 				     const unsigned short protocol,
-				     const u32 version, const char *hostname)
+				     const u32 version,
+				     const char *hostname,
+				     int noresvport)
 {
 	const struct sockaddr source = {
 		.sa_family	= AF_UNSPEC,
@@ -296,6 +301,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 		.hostname_len	= strlen(hostname),
 		.src_sap	= &source,
 		.src_len	= sizeof(source),
+		.noresvport	= noresvport,
 	};
 
 	dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
@@ -417,6 +423,8 @@ nlm_bind_host(struct nlm_host *host)
 		 */
 		if (!host->h_server)
 			args.flags |= RPC_CLNT_CREATE_HARDRTRY;
+		if (host->h_noresvport)
+			args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
 		clnt = rpc_create(&args);
 		if (!IS_ERR(clnt))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3a69cacc4fa..70b6d9e8517 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -526,6 +526,8 @@ static int nfs_start_lockd(struct nfs_server *server)
 		.protocol	= server->flags & NFS_MOUNT_TCP ?
 						IPPROTO_TCP : IPPROTO_UDP,
 		.nfs_version	= clp->rpc_ops->version,
+		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
+					1 : 0,
 	};
 
 	if (nlm_init.nfs_version > 3)
diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h
index e5872dc994c..fbc48f89852 100644
--- a/include/linux/lockd/bind.h
+++ b/include/linux/lockd/bind.h
@@ -41,6 +41,7 @@ struct nlmclnt_initdata {
 	size_t			addrlen;
 	unsigned short		protocol;
 	u32			nfs_version;
+	int			noresvport;
 };
 
 /*
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index b56d5aa9b19..23da3fa69ef 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -49,6 +49,7 @@ struct nlm_host {
 	unsigned short		h_proto;	/* transport proto */
 	unsigned short		h_reclaiming : 1,
 				h_server     : 1, /* server side, not client side */
+				h_noresvport : 1,
 				h_inuse      : 1;
 	wait_queue_head_t	h_gracewait;	/* wait while reclaiming */
 	struct rw_semaphore	h_rwsem;	/* Reboot recovery lock */
@@ -220,7 +221,8 @@ struct nlm_host  *nlmclnt_lookup_host(const struct sockaddr *sap,
 					const size_t salen,
 					const unsigned short protocol,
 					const u32 version,
-					const char *hostname);
+					const char *hostname,
+					int noresvport);
 struct nlm_host  *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 					const char *hostname,
 					const size_t hostname_len);
-- 
cgit v1.2.3-70-g09d2


From 95d35cb4c473c754824967c0b069bbeb7efa4847 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 23 Dec 2008 15:21:45 -0500
Subject: NFSv4: Remove nfs_client->cl_sem

Now that we're using the flags to indicate state that needs to be
recovered, as well as having implemented proper refcounting and spinlocking
on the state and open_owners, we can get rid of nfs_client->cl_sem. The
only remaining case that was dubious was the file locking, and that case is
now covered by the nfsi->rwsem.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  1 -
 fs/nfs/delegation.c       |  5 -----
 fs/nfs/nfs4proc.c         | 20 +-------------------
 fs/nfs/nfs4renewd.c       |  3 ---
 fs/nfs/nfs4state.c        | 17 -----------------
 include/linux/nfs_fs_sb.h |  6 ------
 6 files changed, 1 insertion(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 70b6d9e8517..b643f0ec5c2 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_proto = cl_init->proto;
 
 #ifdef CONFIG_NFS_V4
-	init_rwsem(&clp->cl_sem);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	spin_lock_init(&clp->cl_lock);
 	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 646ba3e75a1..ebc06f2da31 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -243,16 +243,13 @@ static void nfs_msync_inode(struct inode *inode)
  */
 static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	nfs_msync_inode(inode);
-	down_read(&clp->cl_sem);
 	/* Guard against new delegated open calls */
 	down_write(&nfsi->rwsem);
 	nfs_delegation_claim_opens(inode, &delegation->stateid);
 	up_write(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
 	nfs_msync_inode(inode);
 
 	return nfs_do_return_delegation(inode, delegation, 1);
@@ -425,7 +422,6 @@ static int recall_thread(void *data)
 	daemonize("nfsv4-delegreturn");
 
 	nfs_msync_inode(inode);
-	down_read(&clp->cl_sem);
 	down_write(&nfsi->rwsem);
 	spin_lock(&clp->cl_lock);
 	delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
@@ -437,7 +433,6 @@ static int recall_thread(void *data)
 	complete(&args->started);
 	nfs_delegation_claim_opens(inode, args->stateid);
 	up_write(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
 	nfs_msync_inode(inode);
 
 	if (delegation != NULL)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index aec4e47c462..26dcd77abb0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -207,12 +207,8 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 
 	might_sleep();
 
-	rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
-
 	res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
 			nfs4_wait_bit_killable, TASK_KILLABLE);
-
-	rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
 	return res;
 }
 
@@ -1135,7 +1131,6 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
 	struct nfs_server       *server = NFS_SERVER(dir);
-	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_opendata *opendata;
 	int status;
 
@@ -1150,11 +1145,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 		goto err_put_state_owner;
 	if (path->dentry->d_inode != NULL)
 		nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
-	down_read(&clp->cl_sem);
 	status = -ENOMEM;
 	opendata = nfs4_opendata_alloc(path, sp, flags, sattr);
 	if (opendata == NULL)
-		goto err_release_rwsem;
+		goto err_put_state_owner;
 
 	if (path->dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
@@ -1172,13 +1166,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 		goto err_opendata_put;
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-	up_read(&clp->cl_sem);
 	*res = state;
 	return 0;
 err_opendata_put:
 	nfs4_opendata_put(opendata);
-err_release_rwsem:
-	up_read(&clp->cl_sem);
 err_put_state_owner:
 	nfs4_put_state_owner(sp);
 out_err:
@@ -3099,7 +3090,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	struct nfs4_lock_state *lsp;
 	int status;
 
-	down_read(&clp->cl_sem);
 	arg.lock_owner.clientid = clp->cl_clientid;
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
@@ -3116,7 +3106,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	}
 	request->fl_ops->fl_release_private(request);
 out:
-	up_read(&clp->cl_sem);
 	return status;
 }
 
@@ -3273,7 +3262,6 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs_client *clp = state->owner->so_client;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
@@ -3284,15 +3272,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = nfs4_set_lock_state(state, request);
 	/* Unlock _before_ we do the RPC call */
 	request->fl_flags |= FL_EXISTS;
-	down_read(&clp->cl_sem);
 	down_read(&nfsi->rwsem);
 	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
 		up_read(&nfsi->rwsem);
-		up_read(&clp->cl_sem);
 		goto out;
 	}
 	up_read(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
 	if (status != 0)
 		goto out;
 	/* Is this a delegated lock? */
@@ -3518,7 +3503,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs_client *clp = state->owner->so_client;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
 	int status;
@@ -3531,7 +3515,6 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	status = do_vfs_lock(request->fl_file, request);
 	if (status < 0)
 		goto out;
-	down_read(&clp->cl_sem);
 	down_read(&nfsi->rwsem);
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
 		/* Yes: cache locks! */
@@ -3549,7 +3532,6 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
 out_unlock:
 	up_read(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
 out:
 	request->fl_flags = fl_flags;
 	return status;
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 9fe8640a88e..6101f955f23 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work)
 	long lease, timeout;
 	unsigned long last, now;
 
-	down_read(&clp->cl_sem);
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
@@ -101,11 +100,9 @@ nfs4_renew_state(struct work_struct *work)
 	schedule_delayed_work(&clp->cl_renewd, timeout);
 	spin_unlock(&clp->cl_lock);
 out:
-	up_read(&clp->cl_sem);
 	dprintk("%s: done\n", __func__);
 }
 
-/* Must be called with clp->cl_sem locked for writes */
 void
 nfs4_schedule_state_renewal(struct nfs_client *clp)
 {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 16c9fbdf97b..ec0cbe00a80 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -305,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 	}
 }
 
-/*
- * Note: must be called with clp->cl_sem held in order to prevent races
- *       with reboot recovery!
- */
 struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -337,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	return sp;
 }
 
-/*
- * Must be called with clp->cl_sem held in order to avoid races
- * with state recovery...
- */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
 	struct nfs_client *clp = sp->so_client;
@@ -442,10 +434,6 @@ out:
 	return state;
 }
 
-/*
- * Beware! Caller must be holding exactly one
- * reference to clp->cl_sem!
- */
 void nfs4_put_open_state(struct nfs4_state *state)
 {
 	struct inode *inode = state->inode;
@@ -578,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
  *
- * The caller must be holding clp->cl_sem
  */
 static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
@@ -1127,7 +1114,6 @@ static int reclaimer(void *ptr)
 	allow_signal(SIGKILL);
 
 	/* Ensure exclusive access to NFSv4 state */
-	down_write(&clp->cl_sem);
 	while (!list_empty(&clp->cl_superblocks)) {
 		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
 			/* We're going to have to re-establish a clientid */
@@ -1149,7 +1135,6 @@ static int reclaimer(void *ptr)
 
 		/* First recover reboot state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
-			/* Note: list is protected by exclusive lock on cl->cl_sem */
 			status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
 			if (status == -NFS4ERR_STALE_CLIENTID)
 				continue;
@@ -1159,7 +1144,6 @@ static int reclaimer(void *ptr)
 
 		/* Now recover expired state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
-			/* Note: list is protected by exclusive lock on cl->cl_sem */
 			status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
 			if (status < 0) {
 				set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
@@ -1175,7 +1159,6 @@ static int reclaimer(void *ptr)
 		break;
 	}
 out:
-	up_write(&clp->cl_sem);
 	if (test_and_clear_bit(NFS4CLNT_CB_PATH_DOWN, &clp->cl_state))
 		nfs_handle_cb_pathdown(clp);
 	nfs4_clear_recover_bit(clp);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 4e477ae5869..9bb81aec91c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -42,12 +42,6 @@ struct nfs_client {
 	struct rb_root		cl_openowner_id;
 	struct rb_root		cl_lockowner_id;
 
-	/*
-	 * The following rwsem ensures exclusive access to the server
-	 * while we recover the state following a lease expiration.
-	 */
-	struct rw_semaphore	cl_sem;
-
 	struct list_head	cl_delegations;
 	struct rb_root		cl_state_owners;
 	spinlock_t		cl_lock;
-- 
cgit v1.2.3-70-g09d2


From bd7bf9d540c001055fba796ebf146d90e4dd2eb2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 23 Dec 2008 15:21:53 -0500
Subject: NFSv4: Convert delegation->type field to fmode_t

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c     | 2 +-
 fs/nfs/delegation.h     | 6 +++---
 fs/nfs/nfs4xdr.c        | 4 ++--
 include/linux/nfs_fs.h  | 2 +-
 include/linux/nfs_xdr.h | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index e75f2f8c524..968225a8801 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -48,7 +48,7 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
 	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
-int nfs_have_delegation(struct inode *inode, int flags)
+int nfs_have_delegation(struct inode *inode, fmode_t flags)
 {
 	struct nfs_delegation *delegation;
 	int ret = 0;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 2a74882e5d5..09f38379517 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -17,7 +17,7 @@ struct nfs_delegation {
 	struct rpc_cred *cred;
 	struct inode *inode;
 	nfs4_stateid stateid;
-	int type;
+	fmode_t type;
 	loff_t maxsize;
 	__u64 change_attr;
 	unsigned long flags;
@@ -54,10 +54,10 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
-int nfs_have_delegation(struct inode *inode, int flags);
+int nfs_have_delegation(struct inode *inode, fmode_t flags);
 
 #else
-static inline int nfs_have_delegation(struct inode *inode, int flags)
+static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
 {
 	return 0;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b916297d233..879911c9903 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1024,7 +1024,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 	}
 }
 
-static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
+static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
 {
 	__be32 *p;
 
@@ -1053,7 +1053,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 	encode_string(xdr, name->len, name->name);
 }
 
-static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
+static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
 {
 	__be32 *p;
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f11077285a6..8d71d7b7c78 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -180,7 +180,7 @@ struct nfs_inode {
         /* NFSv4 state */
 	struct list_head	open_states;
 	struct nfs_delegation	*delegation;
-	int			 delegation_state;
+	fmode_t			 delegation_state;
 	struct rw_semaphore	rwsem;
 #endif /* CONFIG_NFS_V4*/
 	struct inode		vfs_inode;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c1c31acb8a2..32c1a0ecdbf 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -126,7 +126,7 @@ struct nfs_openargs {
 		struct iattr *  attrs;    /* UNCHECKED, GUARDED */
 		nfs4_verifier   verifier; /* EXCLUSIVE */
 		nfs4_stateid	delegation;		/* CLAIM_DELEGATE_CUR */
-		int		delegation_type;	/* CLAIM_PREVIOUS */
+		fmode_t		delegation_type;	/* CLAIM_PREVIOUS */
 	} u;
 	const struct qstr *	name;
 	const struct nfs_server *server;	 /* Needed for ID mapping */
@@ -143,7 +143,7 @@ struct nfs_openres {
 	struct nfs_fattr *      dir_attr;
 	struct nfs_seqid *	seqid;
 	const struct nfs_server *server;
-	int			delegation_type;
+	fmode_t			delegation_type;
 	nfs4_stateid		delegation;
 	__u32			do_recall;
 	__u64			maxsize;
-- 
cgit v1.2.3-70-g09d2


From dc0b027dfadfcb8a5504f7d8052754bf8d501ab9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 23 Dec 2008 15:21:56 -0500
Subject: NFSv4: Convert the open and close ops to use fmode

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          |   2 +-
 fs/nfs/nfs4_fs.h        |   8 +--
 fs/nfs/nfs4proc.c       | 126 ++++++++++++++++++++++++++----------------------
 fs/nfs/nfs4state.c      |  24 ++++-----
 fs/nfs/nfs4xdr.c        |  10 ++--
 include/linux/nfs_fs.h  |   4 +-
 include/linux/nfs_xdr.h |   3 +-
 7 files changed, 94 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d22eb383e1c..1cf39202c3e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context
 /*
  * Given an inode, search for an open context with the desired characteristics
  */
-struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode)
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *pos, *ctx = NULL;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ee5f51ef696..d5f5efaf2b2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -161,7 +161,7 @@ struct nfs4_state {
 	unsigned int n_rdonly;		/* Number of read-only references */
 	unsigned int n_wronly;		/* Number of write-only references */
 	unsigned int n_rdwr;		/* Number of read/write references */
-	int state;			/* State on the server (R,W, or RW) */
+	fmode_t state;			/* State on the server (R,W, or RW) */
 	atomic_t count;
 };
 
@@ -223,9 +223,9 @@ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struc
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
 extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
 extern void nfs4_put_open_state(struct nfs4_state *);
-extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
+extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fc0c9d255cf..e8df52dc2bc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -323,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 }
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
-		struct nfs4_state_owner *sp, int flags,
+		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
 		const struct iattr *attrs)
 {
 	struct dentry *parent = dget_parent(path->dentry);
@@ -343,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
 	p->o_arg.fh = NFS_FH(dir);
-	p->o_arg.open_flags = flags,
+	p->o_arg.open_flags = flags;
+	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id = sp->so_owner_id.id;
 	p->o_arg.name = &p->path.dentry->d_name;
@@ -399,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
 	return ret;
 }
 
-static int can_open_cached(struct nfs4_state *state, int mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
 {
 	int ret = 0;
-	switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) {
+
+	if (open_mode & O_EXCL)
+		goto out;
+	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
 			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
 			break;
@@ -412,12 +416,13 @@ static int can_open_cached(struct nfs4_state *state, int mode)
 		case FMODE_READ|FMODE_WRITE:
 			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
 	}
+out:
 	return ret;
 }
 
-static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
 {
-	if ((delegation->type & open_flags) != open_flags)
+	if ((delegation->type & fmode) != fmode)
 		return 0;
 	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
 		return 0;
@@ -425,9 +430,9 @@ static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_fla
 	return 1;
 }
 
-static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
 {
-	switch (open_flags) {
+	switch (fmode) {
 		case FMODE_WRITE:
 			state->n_wronly++;
 			break;
@@ -437,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
 		case FMODE_READ|FMODE_WRITE:
 			state->n_rdwr++;
 	}
-	nfs4_state_set_mode_locked(state, state->state | open_flags);
+	nfs4_state_set_mode_locked(state, state->state | fmode);
 }
 
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
 	memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
-	switch (open_flags) {
+	switch (fmode) {
 		case FMODE_READ:
 			set_bit(NFS_O_RDONLY_STATE, &state->flags);
 			break;
@@ -457,14 +462,14 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
 	}
 }
 
-static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	write_seqlock(&state->seqlock);
-	nfs_set_open_stateid_locked(state, stateid, open_flags);
+	nfs_set_open_stateid_locked(state, stateid, fmode);
 	write_sequnlock(&state->seqlock);
 }
 
-static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, int open_flags)
+static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
 {
 	/*
 	 * Protect the call to nfs4_state_set_mode_locked and
@@ -476,20 +481,20 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
-		nfs_set_open_stateid_locked(state, open_stateid, open_flags);
+		nfs_set_open_stateid_locked(state, open_stateid, fmode);
 	write_sequnlock(&state->seqlock);
 	spin_lock(&state->owner->so_lock);
-	update_open_stateflags(state, open_flags);
+	update_open_stateflags(state, fmode);
 	spin_unlock(&state->owner->so_lock);
 }
 
-static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, int open_flags)
+static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *deleg_cur;
 	int ret = 0;
 
-	open_flags &= (FMODE_READ|FMODE_WRITE);
+	fmode &= (FMODE_READ|FMODE_WRITE);
 
 	rcu_read_lock();
 	deleg_cur = rcu_dereference(nfsi->delegation);
@@ -498,7 +503,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
 
 	spin_lock(&deleg_cur->lock);
 	if (nfsi->delegation != deleg_cur ||
-	    (deleg_cur->type & open_flags) != open_flags)
+	    (deleg_cur->type & fmode) != fmode)
 		goto no_delegation_unlock;
 
 	if (delegation == NULL)
@@ -507,7 +512,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
 		goto no_delegation_unlock;
 
 	nfs_mark_delegation_referenced(deleg_cur);
-	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, open_flags);
+	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
 	ret = 1;
 no_delegation_unlock:
 	spin_unlock(&deleg_cur->lock);
@@ -515,7 +520,7 @@ no_delegation:
 	rcu_read_unlock();
 
 	if (!ret && open_stateid != NULL) {
-		__update_open_stateid(state, open_stateid, NULL, open_flags);
+		__update_open_stateid(state, open_stateid, NULL, fmode);
 		ret = 1;
 	}
 
@@ -523,13 +528,13 @@ no_delegation:
 }
 
 
-static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags)
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
 {
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation == NULL || (delegation->type & open_flags) == open_flags) {
+	if (delegation == NULL || (delegation->type & fmode) == fmode) {
 		rcu_read_unlock();
 		return;
 	}
@@ -542,15 +547,16 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs4_state *state = opendata->state;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
-	int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL);
+	int open_mode = opendata->o_arg.open_flags & O_EXCL;
+	fmode_t fmode = opendata->o_arg.fmode;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
 
 	for (;;) {
-		if (can_open_cached(state, open_mode)) {
+		if (can_open_cached(state, fmode, open_mode)) {
 			spin_lock(&state->owner->so_lock);
-			if (can_open_cached(state, open_mode)) {
-				update_open_stateflags(state, open_mode);
+			if (can_open_cached(state, fmode, open_mode)) {
+				update_open_stateflags(state, fmode);
 				spin_unlock(&state->owner->so_lock);
 				goto out_return_state;
 			}
@@ -559,7 +565,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
 		if (delegation == NULL ||
-		    !can_open_delegated(delegation, open_mode)) {
+		    !can_open_delegated(delegation, fmode)) {
 			rcu_read_unlock();
 			break;
 		}
@@ -572,7 +578,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		ret = -EAGAIN;
 
 		/* Try to update the stateid using the delegation */
-		if (update_open_stateid(state, NULL, &stateid, open_mode))
+		if (update_open_stateid(state, NULL, &stateid, fmode))
 			goto out_return_state;
 	}
 out:
@@ -624,7 +630,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	}
 
 	update_open_stateid(state, &data->o_res.stateid, NULL,
-			data->o_arg.open_flags);
+			data->o_arg.fmode);
 	iput(inode);
 out:
 	return state;
@@ -655,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 {
 	struct nfs4_opendata *opendata;
 
-	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL);
+	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -663,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 	return opendata;
 }
 
-static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res)
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
 {
 	struct nfs4_state *newstate;
 	int ret;
 
-	opendata->o_arg.open_flags = openflags;
+	opendata->o_arg.open_flags = 0;
+	opendata->o_arg.fmode = fmode;
 	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
 	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
 	nfs4_init_opendata_res(opendata);
@@ -678,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf
 	newstate = nfs4_opendata_to_nfs4_state(opendata);
 	if (IS_ERR(newstate))
 		return PTR_ERR(newstate);
-	nfs4_close_state(&opendata->path, newstate, openflags);
+	nfs4_close_state(&opendata->path, newstate, fmode);
 	*res = newstate;
 	return 0;
 }
@@ -734,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 {
 	struct nfs_delegation *delegation;
 	struct nfs4_opendata *opendata;
-	int delegation_type = 0;
+	fmode_t delegation_type = 0;
 	int status;
 
 	opendata = nfs4_open_recoverdata_alloc(ctx, state);
@@ -847,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata)
 		goto out_free;
 	state = nfs4_opendata_to_nfs4_state(data);
 	if (!IS_ERR(state))
-		nfs4_close_state(&data->path, state, data->o_arg.open_flags);
+		nfs4_close_state(&data->path, state, data->o_arg.fmode);
 out_free:
 	nfs4_opendata_put(data);
 }
@@ -911,7 +918,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	if (data->state != NULL) {
 		struct nfs_delegation *delegation;
 
-		if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL)))
+		if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
 			goto out_no_action;
 		rcu_read_lock();
 		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
@@ -980,7 +987,7 @@ static void nfs4_open_release(void *calldata)
 		goto out_free;
 	state = nfs4_opendata_to_nfs4_state(data);
 	if (!IS_ERR(state))
-		nfs4_close_state(&data->path, state, data->o_arg.open_flags);
+		nfs4_close_state(&data->path, state, data->o_arg.fmode);
 out_free:
 	nfs4_opendata_put(data);
 }
@@ -1135,7 +1142,7 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 /*
  * Returns a referenced nfs4_state
  */
-static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
@@ -1153,9 +1160,9 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 	if (status != 0)
 		goto err_put_state_owner;
 	if (path->dentry->d_inode != NULL)
-		nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
+		nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
 	status = -ENOMEM;
-	opendata = nfs4_opendata_alloc(path, sp, flags, sattr);
+	opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
@@ -1187,14 +1194,14 @@ out_err:
 }
 
 
-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
 {
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, path, flags, sattr, cred, &res);
+		status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res);
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1332,7 +1339,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_EXPIRED:
-			if (calldata->arg.open_flags == 0)
+			if (calldata->arg.fmode == 0)
 				break;
 		default:
 			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
@@ -1374,10 +1381,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	nfs_fattr_init(calldata->res.fattr);
 	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.open_flags = FMODE_READ;
+		calldata->arg.fmode = FMODE_READ;
 	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.open_flags = FMODE_WRITE;
+		calldata->arg.fmode = FMODE_WRITE;
 	}
 	calldata->timestamp = jiffies;
 	rpc_call_start(task);
@@ -1430,7 +1437,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
-	calldata->arg.open_flags = 0;
+	calldata->arg.fmode = 0;
 	calldata->arg.bitmask = server->attr_bitmask;
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
@@ -1457,13 +1464,13 @@ out:
 	return status;
 }
 
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
 	struct file *filp;
 	int ret;
 
 	/* If the open_intent is for execute, we have an extra check to make */
-	if (nd->intent.open.flags & FMODE_EXEC) {
+	if (fmode & FMODE_EXEC) {
 		ret = nfs_may_open(state->inode,
 				state->owner->so_cred,
 				nd->intent.open.flags);
@@ -1479,7 +1486,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
 	}
 	ret = PTR_ERR(filp);
 out_close:
-	nfs4_close_sync(path, state, nd->intent.open.flags);
+	nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
 	return ret;
 }
 
@@ -1495,6 +1502,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	struct dentry *res;
+	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 
 	if (nd->flags & LOOKUP_CREATE) {
 		attr.ia_mode = nd->intent.open.create_mode;
@@ -1512,7 +1520,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
+	state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		if (PTR_ERR(state) == -ENOENT) {
@@ -1527,7 +1535,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		path.dentry = res;
 	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
 	nfs_unblock_sillyrename(parent);
-	nfs4_intent_set_file(nd, &path, state);
+	nfs4_intent_set_file(nd, &path, state, fmode);
 	return res;
 }
 
@@ -1540,11 +1548,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	};
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
+	fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
 
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	state = nfs4_do_open(dir, &path, openflags, NULL, cred);
+	state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		switch (PTR_ERR(state)) {
@@ -1561,10 +1570,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	}
 	if (state->inode == dentry->d_inode) {
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		nfs4_intent_set_file(nd, &path, state);
+		nfs4_intent_set_file(nd, &path, state, fmode);
 		return 1;
 	}
-	nfs4_close_sync(&path, state, openflags);
+	nfs4_close_sync(&path, state, fmode);
 out_drop:
 	d_drop(dentry);
 	return 0;
@@ -1990,6 +1999,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	};
 	struct nfs4_state *state;
 	struct rpc_cred *cred;
+	fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
 	int status = 0;
 
 	cred = rpc_lookup_cred();
@@ -1997,7 +2007,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		status = PTR_ERR(cred);
 		goto out;
 	}
-	state = nfs4_do_open(dir, &path, flags, sattr, cred);
+	state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
@@ -2013,9 +2023,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		nfs_post_op_update_inode(state->inode, &fattr);
 	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
-		status = nfs4_intent_set_file(nd, &path, state);
+		status = nfs4_intent_set_file(nd, &path, state, fmode);
 	else
-		nfs4_close_sync(&path, state, flags);
+		nfs4_close_sync(&path, state, fmode);
 out_putcred:
 	put_rpccred(cred);
 out:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cd16b301f13..2022fe47966 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -363,18 +363,18 @@ nfs4_alloc_open_state(void)
 }
 
 void
-nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode)
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
 {
-	if (state->state == mode)
+	if (state->state == fmode)
 		return;
 	/* NB! List reordering - see the reclaim code for why.  */
-	if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
-		if (mode & FMODE_WRITE)
+	if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+		if (fmode & FMODE_WRITE)
 			list_move(&state->open_states, &state->owner->so_states);
 		else
 			list_move_tail(&state->open_states, &state->owner->so_states);
 	}
-	state->state = mode;
+	state->state = fmode;
 }
 
 static struct nfs4_state *
@@ -454,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state)
 /*
  * Close the current file.
  */
-static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
 {
 	struct nfs4_state_owner *owner = state->owner;
 	int call_close = 0;
-	int newstate;
+	fmode_t newstate;
 
 	atomic_inc(&owner->so_count);
 	/* Protect against nfs4_find_state() */
 	spin_lock(&owner->so_lock);
-	switch (mode & (FMODE_READ | FMODE_WRITE)) {
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
 		case FMODE_READ:
 			state->n_rdonly--;
 			break;
@@ -498,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod
 		nfs4_do_close(path, state, wait);
 }
 
-void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, mode, 0);
+	__nfs4_close(path, state, fmode, 0);
 }
 
-void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, mode, 1);
+	__nfs4_close(path, state, fmode, 1);
 }
 
 /*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 879911c9903..d8ddfc5467d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -953,12 +953,12 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
 	return 0;
 }
 
-static void encode_share_access(struct xdr_stream *xdr, int open_flags)
+static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
 {
 	__be32 *p;
 
 	RESERVE_SPACE(8);
-	switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
+	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
 			WRITE32(NFS4_SHARE_ACCESS_READ);
 			break;
@@ -969,7 +969,7 @@ static void encode_share_access(struct xdr_stream *xdr, int open_flags)
 			WRITE32(NFS4_SHARE_ACCESS_BOTH);
 			break;
 		default:
-			BUG();
+			WRITE32(0);
 	}
 	WRITE32(0);		/* for linux, share_deny = 0 always */
 }
@@ -984,7 +984,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	RESERVE_SPACE(8);
 	WRITE32(OP_OPEN);
 	WRITE32(arg->seqid->sequence->counter);
-	encode_share_access(xdr, arg->open_flags);
+	encode_share_access(xdr, arg->fmode);
 	RESERVE_SPACE(28);
 	WRITE64(arg->clientid);
 	WRITE32(16);
@@ -1112,7 +1112,7 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
 	WRITE32(OP_OPEN_DOWNGRADE);
 	WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
 	WRITE32(arg->seqid->sequence->counter);
-	encode_share_access(xdr, arg->open_flags);
+	encode_share_access(xdr, arg->fmode);
 	return 0;
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8d71d7b7c78..b8d9c6dd4f6 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -83,7 +83,7 @@ struct nfs_open_context {
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	fl_owner_t lockowner;
-	int mode;
+	fmode_t mode;
 
 	unsigned long flags;
 #define NFS_CONTEXT_ERROR_WRITE		(0)
@@ -342,7 +342,7 @@ extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
-extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode);
+extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode);
 extern u64 nfs_compat_user_ino64(u64 fileid);
 extern void nfs_fattr_init(struct nfs_fattr *fattr);
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 32c1a0ecdbf..a550b528319 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -120,6 +120,7 @@ struct nfs_openargs {
 	const struct nfs_fh *	fh;
 	struct nfs_seqid *	seqid;
 	int			open_flags;
+	fmode_t			fmode;
 	__u64                   clientid;
 	__u64                   id;
 	union {
@@ -171,7 +172,7 @@ struct nfs_closeargs {
 	struct nfs_fh *         fh;
 	nfs4_stateid *		stateid;
 	struct nfs_seqid *	seqid;
-	int			open_flags;
+	fmode_t			fmode;
 	const u32 *		bitmask;
 };
 
-- 
cgit v1.2.3-70-g09d2


From 64672d55d93c26fb4035fd1a84a803cbc09cb058 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Tue, 23 Dec 2008 15:21:56 -0500
Subject: optimize attribute timeouts for "noac" and "actimeo=0"

Hi.

I've been looking at a bugzilla which describes a problem where
a customer was advised to use either the "noac" or "actimeo=0"
mount options to solve a consistency problem that they were
seeing in the file attributes.  It turned out that this solution
did not work reliably for them because sometimes, the local
attribute cache was believed to be valid and not timed out.
(With an attribute cache timeout of 0, the cache should always
appear to be timed out.)

In looking at this situation, it appears to me that the problem
is that the attribute cache timeout code has an off-by-one
error in it.  It is assuming that the cache is valid in the
region, [read_cache_jiffies, read_cache_jiffies + attrtimeo].  The
cache should be considered valid only in the region,
[read_cache_jiffies, read_cache_jiffies + attrtimeo).  With this
change, the options, "noac" and "actimeo=0", work as originally
expected.

This problem was previously addressed by special casing the
attrtimeo == 0 case.  However, since the problem is only an off-
by-one error, the cleaner solution is address the off-by-one
error and thus, not require the special case.

    Thanx...

        ps

Signed-off-by: Peter Staubach <staubach@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            |  2 +-
 fs/nfs/inode.c          | 11 ++---------
 include/linux/jiffies.h | 10 ++++++++++
 include/linux/nfs_fs.h  |  5 ++++-
 net/sunrpc/auth.c       |  2 +-
 5 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ed7024c3488..e35c8199f82 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1798,7 +1798,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 	if (cache == NULL)
 		goto out;
 	if (!nfs_have_delegation(inode, FMODE_READ) &&
-	    !time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cf39202c3e..0c381686171 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode)
 
 	if (nfs_have_delegation(inode, FMODE_READ))
 		return 0;
-	/*
-	 * Special case: if the attribute timeout is set to 0, then always
-	 * 		 treat the cache as having expired (unless holding
-	 * 		 a delegation).
-	 */
-	if (nfsi->attrtimeo == 0)
-		return 1;
-	return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
 /**
@@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	} else {
-		if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+		if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
 			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
 				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 			nfsi->attrtimeo_timestamp = now;
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index abb6ac639e8..1a9cf78bfce 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -115,10 +115,20 @@ static inline u64 get_jiffies_64(void)
 	 ((long)(a) - (long)(b) >= 0))
 #define time_before_eq(a,b)	time_after_eq(b,a)
 
+/*
+ * Calculate whether a is in the range of [b, c].
+ */
 #define time_in_range(a,b,c) \
 	(time_after_eq(a,b) && \
 	 time_before_eq(a,c))
 
+/*
+ * Calculate whether a is in the range of [b, c).
+ */
+#define time_in_range_open(a,b,c) \
+	(time_after_eq(a,b) && \
+	 time_before(a,c))
+
 /* Same as above, but does so with platform independent 64bit types.
  * These must be used when utilizing jiffies_64 (i.e. return value of
  * get_jiffies_64() */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b8d9c6dd4f6..db867b04ac3 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -130,7 +130,10 @@ struct nfs_inode {
 	 *
 	 * We need to revalidate the cached attrs for this inode if
 	 *
-	 *	jiffies - read_cache_jiffies > attrtimeo
+	 *	jiffies - read_cache_jiffies >= attrtimeo
+	 *
+	 * Please note the comparison is greater than or equal
+	 * so that zero timeout values can be specified.
 	 */
 	unsigned long		read_cache_jiffies;
 	unsigned long		attrtimeo;
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 6e28744b170..050e4e84d9e 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -234,7 +234,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 	list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) {
 
 		/* Enforce a 60 second garbage collection moratorium */
-		if (time_in_range(cred->cr_expire, expired, jiffies) &&
+		if (time_in_range_open(cred->cr_expire, expired, jiffies) &&
 		    test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
 			continue;
 
-- 
cgit v1.2.3-70-g09d2


From c977a2ef40a38c45537ad03823d0a004f06373f0 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Tue, 23 Dec 2008 16:06:13 -0500
Subject: sunrpc: get rid of rpc_rqst.rq_bufsize

rq_bufsize is not used.

Signed-off-by: Mike Sager <Mike.Sager@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 4d80a118d53..11fc71d50c1 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -76,8 +76,7 @@ struct rpc_rqst {
 	struct list_head	rq_list;
 
 	__u32 *			rq_buffer;	/* XDR encode buffer */
-	size_t			rq_bufsize,
-				rq_callsize,
+	size_t			rq_callsize,
 				rq_rcvsize;
 
 	struct xdr_buf		rq_private_buf;		/* The receive buffer
-- 
cgit v1.2.3-70-g09d2


From c381060869317b3c84430d4f54965d409cbfe65f Mon Sep 17 00:00:00 2001
From: "\\\"J. Bruce Fields\\" <bfields@citi.umich.edu>
Date: Tue, 23 Dec 2008 16:08:32 -0500
Subject: rpc: add an rpc_pipe_open method

We want to transition to a new gssd upcall which is text-based and more
easily extensible.

To simplify upgrades, as well as testing and debugging, it will help if
we can upgrade gssd (to a version which understands the new upcall)
without having to choose at boot (or module-load) time whether we want
the new or the old upcall.

We will do this by providing two different pipes: one named, as
currently, after the mechanism (normally "krb5"), and supporting the
old upcall.  One named "gssd" and supporting the new upcall version.

We allow gssd to indicate which version it supports by its choice of
which pipe to open.

As we have no interest in supporting *simultaneous* use of both
versions, we'll forbid opening both pipes at the same time.

So, add a new pipe_open callback to the rpc_pipefs api, which the gss
code can use to track which pipes have been open, and to refuse opens of
incompatible pipes.

We only need this to be called on the first open of a given pipe.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/rpc_pipe_fs.h |  1 +
 net/sunrpc/rpc_pipe.c              | 22 +++++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index 51b977a4ca2..cea764c2359 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -15,6 +15,7 @@ struct rpc_pipe_ops {
 	ssize_t (*upcall)(struct file *, struct rpc_pipe_msg *, char __user *, size_t);
 	ssize_t (*downcall)(struct file *, const char __user *, size_t);
 	void (*release_pipe)(struct inode *);
+	int (*open_pipe)(struct inode *);
 	void (*destroy_msg)(struct rpc_pipe_msg *);
 };
 
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 55b2049834c..c9b57f47108 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -169,16 +169,24 @@ static int
 rpc_pipe_open(struct inode *inode, struct file *filp)
 {
 	struct rpc_inode *rpci = RPC_I(inode);
+	int first_open;
 	int res = -ENXIO;
 
 	mutex_lock(&inode->i_mutex);
-	if (rpci->ops != NULL) {
-		if (filp->f_mode & FMODE_READ)
-			rpci->nreaders ++;
-		if (filp->f_mode & FMODE_WRITE)
-			rpci->nwriters ++;
-		res = 0;
+	if (rpci->ops == NULL)
+		goto out;
+	first_open = rpci->nreaders == 0 && rpci->nwriters == 0;
+	if (first_open && rpci->ops->open_pipe) {
+		res = rpci->ops->open_pipe(inode);
+		if (res)
+			goto out;
 	}
+	if (filp->f_mode & FMODE_READ)
+		rpci->nreaders++;
+	if (filp->f_mode & FMODE_WRITE)
+		rpci->nwriters++;
+	res = 0;
+out:
 	mutex_unlock(&inode->i_mutex);
 	return res;
 }
@@ -748,7 +756,7 @@ rpc_rmdir(struct dentry *dentry)
  * @name: name of pipe
  * @private: private data to associate with the pipe, for the caller's use
  * @ops: operations defining the behavior of the pipe: upcall, downcall,
- *	release_pipe, and destroy_msg.
+ *	release_pipe, open_pipe, and destroy_msg.
  * @flags: rpc_inode flags
  *
  * Data is made available for userspace to read by calls to
-- 
cgit v1.2.3-70-g09d2


From 68e76ad0baf8f5d5060377c2423ee6eed5c63057 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <aglo@citi.umich.edu>
Date: Tue, 23 Dec 2008 16:17:15 -0500
Subject: nfsd: pass client principal name in rsc downcall

Two principals are involved in krb5 authentication: the target, who we
authenticate *to* (normally the name of the server, like
nfs/server.citi.umich.edu@CITI.UMICH.EDU), and the source, we we
authenticate *as* (normally a user, like bfields@UMICH.EDU)

In the case of NFSv4 callbacks, the target of the callback should be the
source of the client's setclientid call, and the source should be the
nfs server's own principal.

Therefore we allow svcgssd to pass down the name of the principal that
just authenticated, so that on setclientid we can store that principal
name with the new client, to be used later on callbacks.

Signed-off-by: Olga Kornievskaia <aglo@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4state.c                | 11 +++++++++++
 include/linux/nfsd/state.h         |  1 +
 include/linux/sunrpc/svcauth_gss.h |  1 +
 net/sunrpc/auth_gss/svcauth_gss.c  | 23 +++++++++++++++++++++++
 4 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1a052ac2bde..f3b9a8d064f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,6 +54,7 @@
 #include <linux/mutex.h>
 #include <linux/lockd/bind.h>
 #include <linux/module.h>
+#include <linux/sunrpc/svcauth_gss.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp)
 	shutdown_callback_client(clp);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
+	kfree(clp->cl_principal);
 	kfree(clp->cl_name.data);
 	kfree(clp);
 }
@@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	unsigned int 		strhashval;
 	struct nfs4_client	*conf, *unconf, *new;
 	__be32 			status;
+	char			*princ;
 	char                    dname[HEXDIR_LEN];
 	
 	if (!check_name(clname))
@@ -783,6 +786,14 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	copy_verf(new, &clverifier);
 	new->cl_addr = sin->sin_addr.s_addr;
+	princ = svc_gss_principal(rqstp);
+	if (princ) {
+		new->cl_principal = kstrdup(princ, GFP_KERNEL);
+		if (new->cl_principal == NULL) {
+			free_client(new);
+			goto out;
+		}
+	}
 	copy_cred(&new->cl_cred, &rqstp->rq_cred);
 	gen_confirm(new);
 	gen_callback(new, setclid);
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index d0fe2e37845..ce7cbf4b7c9 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -124,6 +124,7 @@ struct nfs4_client {
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
 	__be32			cl_addr; 	/* client ipaddress */
+	char			*cl_principal;	/* setclientid principal name */
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h
index c9165d9771a..ca7d725861f 100644
--- a/include/linux/sunrpc/svcauth_gss.h
+++ b/include/linux/sunrpc/svcauth_gss.h
@@ -20,6 +20,7 @@ int gss_svc_init(void);
 void gss_svc_shutdown(void);
 int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
 u32 svcauth_gss_flavor(struct auth_domain *dom);
+char *svc_gss_principal(struct svc_rqst *);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 12803da95dc..e9baa6ebb1d 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -332,6 +332,7 @@ struct rsc {
 	struct svc_cred		cred;
 	struct gss_svc_seq_data	seqdata;
 	struct gss_ctx		*mechctx;
+	char			*client_name;
 };
 
 static struct cache_head *rsc_table[RSC_HASHMAX];
@@ -346,6 +347,7 @@ static void rsc_free(struct rsc *rsci)
 		gss_delete_sec_context(&rsci->mechctx);
 	if (rsci->cred.cr_group_info)
 		put_group_info(rsci->cred.cr_group_info);
+	kfree(rsci->client_name);
 }
 
 static void rsc_put(struct kref *ref)
@@ -383,6 +385,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
 	tmp->handle.data = NULL;
 	new->mechctx = NULL;
 	new->cred.cr_group_info = NULL;
+	new->client_name = NULL;
 }
 
 static void
@@ -397,6 +400,8 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
 	spin_lock_init(&new->seqdata.sd_lock);
 	new->cred = tmp->cred;
 	tmp->cred.cr_group_info = NULL;
+	new->client_name = tmp->client_name;
+	tmp->client_name = NULL;
 }
 
 static struct cache_head *
@@ -486,6 +491,15 @@ static int rsc_parse(struct cache_detail *cd,
 		status = gss_import_sec_context(buf, len, gm, &rsci.mechctx);
 		if (status)
 			goto out;
+
+		/* get client name */
+		len = qword_get(&mesg, buf, mlen);
+		if (len > 0) {
+			rsci.client_name = kstrdup(buf, GFP_KERNEL);
+			if (!rsci.client_name)
+				goto out;
+		}
+
 	}
 	rsci.h.expiry_time = expiry;
 	rscp = rsc_update(&rsci, rscp);
@@ -913,6 +927,15 @@ struct gss_svc_data {
 	struct rsc			*rsci;
 };
 
+char *svc_gss_principal(struct svc_rqst *rqstp)
+{
+	struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data;
+
+	if (gd && gd->rsci)
+		return gd->rsci->client_name;
+	return NULL;
+}
+
 static int
 svcauth_gss_set_client(struct svc_rqst *rqstp)
 {
-- 
cgit v1.2.3-70-g09d2


From 608207e8884e083ad8b8d33eda868da70f0d63e8 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <aglo@citi.umich.edu>
Date: Tue, 23 Dec 2008 16:17:40 -0500
Subject: rpc: pass target name down to rpc level on callbacks

The rpc client needs to know the principal that the setclientid was done
as, so it can tell gssd who to authenticate to.

Signed-off-by: Olga Kornievskaia <aglo@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4callback.c      |  6 ++++++
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 16 ++++++++++++++++
 3 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 094747a1227..3ca14178214 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -384,6 +384,7 @@ static int do_probe_callback(void *data)
 		.version	= nfs_cb_version[1]->number,
 		.authflavor	= RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
+		.client_name    = clp->cl_principal,
 	};
 	struct rpc_message msg = {
 		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -392,6 +393,11 @@ static int do_probe_callback(void *data)
 	struct rpc_clnt *client;
 	int status;
 
+	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
+		status = nfserr_cb_path_down;
+		goto out_err;
+	}
+
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
 	addr.sin_family = AF_INET;
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 6f0ee1b84a4..c39a21040dc 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -58,6 +58,7 @@ struct rpc_clnt {
 	struct rpc_timeout	cl_timeout_default;
 	struct rpc_program *	cl_program;
 	char			cl_inline_name[32];
+	char			*cl_principal;	/* target to authenticate to */
 };
 
 /*
@@ -108,6 +109,7 @@ struct rpc_create_args {
 	u32			version;
 	rpc_authflavor_t	authflavor;
 	unsigned long		flags;
+	char			*client_name;
 };
 
 /* Values for "flags" field */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4895c341e46..347f2a25abb 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -197,6 +197,12 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 
 	clnt->cl_rtt = &clnt->cl_rtt_default;
 	rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
+	clnt->cl_principal = NULL;
+	if (args->client_name) {
+		clnt->cl_principal = kstrdup(args->client_name, GFP_KERNEL);
+		if (!clnt->cl_principal)
+			goto out_no_principal;
+	}
 
 	kref_init(&clnt->cl_kref);
 
@@ -226,6 +232,8 @@ out_no_auth:
 		rpc_put_mount();
 	}
 out_no_path:
+	kfree(clnt->cl_principal);
+out_no_principal:
 	rpc_free_iostats(clnt->cl_metrics);
 out_no_stats:
 	if (clnt->cl_server != clnt->cl_inline_name)
@@ -354,6 +362,11 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	new->cl_metrics = rpc_alloc_iostats(clnt);
 	if (new->cl_metrics == NULL)
 		goto out_no_stats;
+	if (clnt->cl_principal) {
+		new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL);
+		if (new->cl_principal == NULL)
+			goto out_no_principal;
+	}
 	kref_init(&new->cl_kref);
 	err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
 	if (err != 0)
@@ -366,6 +379,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	rpciod_up();
 	return new;
 out_no_path:
+	kfree(new->cl_principal);
+out_no_principal:
 	rpc_free_iostats(new->cl_metrics);
 out_no_stats:
 	kfree(new);
@@ -417,6 +432,7 @@ rpc_free_client(struct kref *kref)
 out_free:
 	rpc_unregister_client(clnt);
 	rpc_free_iostats(clnt->cl_metrics);
+	kfree(clnt->cl_principal);
 	clnt->cl_metrics = NULL;
 	xprt_put(clnt->cl_xprt);
 	rpciod_down();
-- 
cgit v1.2.3-70-g09d2


From 61054b14d545e257b9415d5ca0cd5f43762b4d0c Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <aglo@citi.umich.edu>
Date: Tue, 23 Dec 2008 16:19:00 -0500
Subject: nfsd: support callbacks with gss flavors

This patch adds server-side support for callbacks other than AUTH_SYS.

Signed-off-by: Olga Kornievskaia <aglo@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfsd/nfs4callback.c     | 3 ++-
 fs/nfsd/nfs4state.c        | 1 +
 include/linux/nfsd/state.h | 1 +
 net/sunrpc/rpc_pipe.c      | 5 +++++
 4 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3ca14178214..6d7d8c02c19 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -358,6 +358,7 @@ static struct rpc_program cb_program = {
 		.nrvers		= ARRAY_SIZE(nfs_cb_version),
 		.version	= nfs_cb_version,
 		.stats		= &cb_stats,
+		.pipe_dir_name  = "/nfsd4_cb",
 };
 
 /* Reference counting, callback cleanup, etc., all look racy as heck.
@@ -382,7 +383,7 @@ static int do_probe_callback(void *data)
 		.program	= &cb_program,
 		.prognumber	= cb->cb_prog,
 		.version	= nfs_cb_version[1]->number,
-		.authflavor	= RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
+		.authflavor	= clp->cl_flavor,
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name    = clp->cl_principal,
 	};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f3b9a8d064f..07db31568ac 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -786,6 +786,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	copy_verf(new, &clverifier);
 	new->cl_addr = sin->sin_addr.s_addr;
+	new->cl_flavor = rqstp->rq_flavor;
 	princ = svc_gss_principal(rqstp);
 	if (princ) {
 		new->cl_principal = kstrdup(princ, GFP_KERNEL);
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index ce7cbf4b7c9..128298c0362 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -124,6 +124,7 @@ struct nfs4_client {
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
 	__be32			cl_addr; 	/* client ipaddress */
+	u32			cl_flavor;	/* setclientid pseudoflavor */
 	char			*cl_principal;	/* setclientid principal name */
 	struct svc_cred		cl_cred; 	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 3105efbb182..19245324887 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -407,6 +407,7 @@ enum {
 	RPCAUTH_nfs,
 	RPCAUTH_portmap,
 	RPCAUTH_statd,
+	RPCAUTH_nfsd4_cb,
 	RPCAUTH_RootEOF
 };
 
@@ -440,6 +441,10 @@ static struct rpc_filelist files[] = {
 		.name = "statd",
 		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
 	},
+	[RPCAUTH_nfsd4_cb] = {
+		.name = "nfsd4_cb",
+		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+	},
 };
 
 enum {
-- 
cgit v1.2.3-70-g09d2


From 4a7794860ba2b56693b1d89fd485fd08cdc763e3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 13 Sep 2008 18:19:03 -0700
Subject: crypto: api - Move type exit function into crypto_tfm

The type exit function needs to undo any allocations done by the type
init function.  However, the type init function may differ depending
on the upper-level type of the transform (e.g., a crypto_blkcipher
instantiated as a crypto_ablkcipher).

So we need to move the exit function out of the lower-level
structure and into crypto_tfm itself.

As it stands this is a no-op since nobody uses exit functions at
all.  However, all cases where a lower-level type is instantiated
as a different upper-level type (such as blkcipher as ablkcipher)
will be converted such that they allocate the underlying transform
and use that instead of casting (e.g., crypto_ablkcipher casted
into crypto_blkcipher).  That will need to use a different exit
function depending on the upper-level type.

This patch also allows the type init/exit functions to call (or not)
cra_init/cra_exit instead of always calling them from the top level.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c            | 13 ++++++-------
 include/crypto/algapi.h |  1 -
 include/linux/crypto.h  |  2 ++
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 0444d242e98..cbaaf346ad1 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -300,8 +300,8 @@ static void crypto_exit_ops(struct crypto_tfm *tfm)
 	const struct crypto_type *type = tfm->__crt_alg->cra_type;
 
 	if (type) {
-		if (type->exit)
-			type->exit(tfm);
+		if (tfm->exit)
+			tfm->exit(tfm);
 		return;
 	}
 
@@ -379,17 +379,16 @@ struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
 	if (err)
 		goto out_free_tfm;
 
-	if (alg->cra_init && (err = alg->cra_init(tfm))) {
-		if (err == -EAGAIN)
-			crypto_shoot_alg(alg);
+	if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
 		goto cra_init_failed;
-	}
 
 	goto out;
 
 cra_init_failed:
 	crypto_exit_ops(tfm);
 out_free_tfm:
+	if (err == -EAGAIN)
+		crypto_shoot_alg(alg);
 	kfree(tfm);
 out_err:
 	tfm = ERR_PTR(err);
@@ -469,7 +468,7 @@ void crypto_free_tfm(struct crypto_tfm *tfm)
 	alg = tfm->__crt_alg;
 	size = sizeof(*tfm) + alg->cra_ctxsize;
 
-	if (alg->cra_exit)
+	if (!tfm->exit && alg->cra_exit)
 		alg->cra_exit(tfm);
 	crypto_exit_ops(tfm);
 	crypto_mod_put(alg);
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 60d06e784be..5fb6d8618d4 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -23,7 +23,6 @@ struct seq_file;
 struct crypto_type {
 	unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask);
 	int (*init)(struct crypto_tfm *tfm, u32 type, u32 mask);
-	void (*exit)(struct crypto_tfm *tfm);
 	void (*show)(struct seq_file *m, struct crypto_alg *alg);
 };
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3d2317e4af2..ea52cd944fd 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -480,6 +480,8 @@ struct crypto_tfm {
 		struct compress_tfm compress;
 		struct rng_tfm rng;
 	} crt_u;
+
+	void (*exit)(struct crypto_tfm *tfm);
 	
 	struct crypto_alg *__crt_alg;
 
-- 
cgit v1.2.3-70-g09d2


From 7b0bac64cd5b74d6f1147524c26216de13a501fd Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 21 Sep 2008 06:52:53 +0900
Subject: crypto: api - Rebirth of crypto_alloc_tfm

This patch reintroduces a completely revamped crypto_alloc_tfm.
The biggest change is that we now take two crypto_type objects
when allocating a tfm, a frontend and a backend.  In fact this
simply formalises what we've been doing behind the API's back.

For example, as it stands crypto_alloc_ahash may use an
actual ahash algorithm or a crypto_hash algorithm.  Putting
this in the API allows us to do this much more cleanly.

The existing types will be converted across gradually.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c            | 108 ++++++++++++++++++++++++++++++++++++++++++++++++
 crypto/internal.h       |   2 +
 include/crypto/algapi.h |  10 +++++
 include/linux/crypto.h  |   4 +-
 4 files changed, 123 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index cbaaf346ad1..9975a7bd246 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -403,6 +403,9 @@ EXPORT_SYMBOL_GPL(__crypto_alloc_tfm);
  *	@type: Type of algorithm
  *	@mask: Mask for type comparison
  *
+ *	This function should not be used by new algorithm types.
+ *	Plesae use crypto_alloc_tfm instead.
+ *
  *	crypto_alloc_base() will first attempt to locate an already loaded
  *	algorithm.  If that fails and the kernel supports dynamically loadable
  *	modules, it will then attempt to load a module of the same name or
@@ -449,6 +452,111 @@ err:
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_base);
+
+struct crypto_tfm *crypto_create_tfm(struct crypto_alg *alg,
+				     const struct crypto_type *frontend)
+{
+	char *mem;
+	struct crypto_tfm *tfm = NULL;
+	unsigned int tfmsize;
+	unsigned int total;
+	int err = -ENOMEM;
+
+	tfmsize = frontend->tfmsize;
+	total = tfmsize + sizeof(*tfm) + frontend->extsize(alg, frontend);
+
+	mem = kzalloc(total, GFP_KERNEL);
+	if (mem == NULL)
+		goto out_err;
+
+	tfm = (struct crypto_tfm *)(mem + tfmsize);
+	tfm->__crt_alg = alg;
+
+	err = frontend->init_tfm(tfm, frontend);
+	if (err)
+		goto out_free_tfm;
+
+	if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm)))
+		goto cra_init_failed;
+
+	goto out;
+
+cra_init_failed:
+	crypto_exit_ops(tfm);
+out_free_tfm:
+	if (err == -EAGAIN)
+		crypto_shoot_alg(alg);
+	kfree(mem);
+out_err:
+	tfm = ERR_PTR(err);
+out:
+	return tfm;
+}
+EXPORT_SYMBOL_GPL(crypto_create_tfm);
+
+/*
+ *	crypto_alloc_tfm - Locate algorithm and allocate transform
+ *	@alg_name: Name of algorithm
+ *	@frontend: Frontend algorithm type
+ *	@type: Type of algorithm
+ *	@mask: Mask for type comparison
+ *
+ *	crypto_alloc_tfm() will first attempt to locate an already loaded
+ *	algorithm.  If that fails and the kernel supports dynamically loadable
+ *	modules, it will then attempt to load a module of the same name or
+ *	alias.  If that fails it will send a query to any loaded crypto manager
+ *	to construct an algorithm on the fly.  A refcount is grabbed on the
+ *	algorithm which is then associated with the new transform.
+ *
+ *	The returned transform is of a non-determinate type.  Most people
+ *	should use one of the more specific allocation functions such as
+ *	crypto_alloc_blkcipher.
+ *
+ *	In case of error the return value is an error pointer.
+ */
+struct crypto_tfm *crypto_alloc_tfm(const char *alg_name,
+				    const struct crypto_type *frontend,
+				    u32 type, u32 mask)
+{
+	struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask);
+	struct crypto_tfm *tfm;
+	int err;
+
+	type &= frontend->maskclear;
+	mask &= frontend->maskclear;
+	type |= frontend->type;
+	mask |= frontend->maskset;
+
+	lookup = frontend->lookup ?: crypto_alg_mod_lookup;
+
+	for (;;) {
+		struct crypto_alg *alg;
+
+		alg = lookup(alg_name, type, mask);
+		if (IS_ERR(alg)) {
+			err = PTR_ERR(alg);
+			goto err;
+		}
+
+		tfm = crypto_create_tfm(alg, frontend);
+		if (!IS_ERR(tfm))
+			return tfm;
+
+		crypto_mod_put(alg);
+		err = PTR_ERR(tfm);
+
+err:
+		if (err != -EAGAIN)
+			break;
+		if (signal_pending(current)) {
+			err = -EINTR;
+			break;
+		}
+	}
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(crypto_alloc_tfm);
  
 /*
  *	crypto_free_tfm - Free crypto transform
diff --git a/crypto/internal.h b/crypto/internal.h
index 8ef72d76092..3c19a27a756 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -109,6 +109,8 @@ void crypto_alg_tested(const char *name, int err);
 void crypto_shoot_alg(struct crypto_alg *alg);
 struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
 				      u32 mask);
+struct crypto_tfm *crypto_create_tfm(struct crypto_alg *alg,
+				     const struct crypto_type *frontend);
 
 int crypto_register_instance(struct crypto_template *tmpl,
 			     struct crypto_instance *inst);
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 5fb6d8618d4..986db68548f 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -22,8 +22,18 @@ struct seq_file;
 
 struct crypto_type {
 	unsigned int (*ctxsize)(struct crypto_alg *alg, u32 type, u32 mask);
+	unsigned int (*extsize)(struct crypto_alg *alg,
+				const struct crypto_type *frontend);
 	int (*init)(struct crypto_tfm *tfm, u32 type, u32 mask);
+	int (*init_tfm)(struct crypto_tfm *tfm,
+		        const struct crypto_type *frontend);
 	void (*show)(struct seq_file *m, struct crypto_alg *alg);
+	struct crypto_alg *(*lookup)(const char *name, u32 type, u32 mask);
+
+	unsigned int type;
+	unsigned int maskclear;
+	unsigned int maskset;
+	unsigned int tfmsize;
 };
 
 struct crypto_instance {
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ea52cd944fd..ffaaa418cf5 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -546,7 +546,9 @@ struct crypto_attr_u32 {
  * Transform user interface.
  */
  
-struct crypto_tfm *crypto_alloc_tfm(const char *alg_name, u32 tfm_flags);
+struct crypto_tfm *crypto_alloc_tfm(const char *alg_name,
+				    const struct crypto_type *frontend,
+				    u32 type, u32 mask);
 struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
 void crypto_free_tfm(struct crypto_tfm *tfm);
 
-- 
cgit v1.2.3-70-g09d2


From 7b5a080b3c46f0cac71c0d0262634c6517d4ee4f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 31 Aug 2008 15:47:27 +1000
Subject: crypto: hash - Add shash interface

The shash interface replaces the current synchronous hash interface.
It improves over hash in two ways.  Firstly shash is reentrant,
meaning that the same tfm may be used by two threads simultaneously
as all hashing state is stored in a local descriptor.

The other enhancement is that shash no longer takes scatter list
entries.  This is because shash is specifically designed for
synchronous algorithms and as such scatter lists are unnecessary.

All existing hash users will be converted to shash once the
algorithms have been completely converted.

There is also a new finup function that combines update with final.
This will be extended to ahash once the algorithm conversion is
done.

This is also the first time that an algorithm type has their own
registration function.  Existing algorithm types will be converted
to this way in due course.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Makefile                |   1 +
 crypto/shash.c                 | 239 +++++++++++++++++++++++++++++++++++++++++
 include/crypto/hash.h          | 104 ++++++++++++++++++
 include/crypto/internal/hash.h |   8 ++
 include/linux/crypto.h         |   1 +
 5 files changed, 353 insertions(+)
 create mode 100644 crypto/shash.c

(limited to 'include/linux')

diff --git a/crypto/Makefile b/crypto/Makefile
index cd4a4ed078f..46b08bf2035 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
 
 crypto_hash-objs := hash.o
 crypto_hash-objs += ahash.o
+crypto_hash-objs += shash.o
 obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o
 
 cryptomgr-objs := algboss.o testmgr.o
diff --git a/crypto/shash.c b/crypto/shash.c
new file mode 100644
index 00000000000..82ec4bd8d2f
--- /dev/null
+++ b/crypto/shash.c
@@ -0,0 +1,239 @@
+/*
+ * Synchronous Cryptographic Hash operations.
+ *
+ * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm)
+{
+	return container_of(tfm, struct crypto_shash, base);
+}
+
+static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
+				  unsigned int keylen)
+{
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	unsigned long absize;
+	u8 *buffer, *alignbuffer;
+	int err;
+
+	absize = keylen + (alignmask & ~(CRYPTO_MINALIGN - 1));
+	buffer = kmalloc(absize, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
+	memcpy(alignbuffer, key, keylen);
+	err = shash->setkey(tfm, alignbuffer, keylen);
+	memset(alignbuffer, 0, keylen);
+	kfree(buffer);
+	return err;
+}
+
+int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
+			unsigned int keylen)
+{
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+	if ((unsigned long)key & alignmask)
+		return shash_setkey_unaligned(tfm, key, keylen);
+
+	return shash->setkey(tfm, key, keylen);
+}
+EXPORT_SYMBOL_GPL(crypto_shash_setkey);
+
+static inline unsigned int shash_align_buffer_size(unsigned len,
+						   unsigned long mask)
+{
+	return len + (mask & ~(__alignof__(u8 __attribute__ ((aligned))) - 1));
+}
+
+static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
+				  unsigned int len)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	unsigned int unaligned_len = alignmask + 1 -
+				     ((unsigned long)data & alignmask);
+	u8 buf[shash_align_buffer_size(unaligned_len, alignmask)]
+		__attribute__ ((aligned));
+
+	memcpy(buf, data, unaligned_len);
+
+	return shash->update(desc, buf, unaligned_len) ?:
+	       shash->update(desc, data + unaligned_len, len - unaligned_len);
+}
+
+int crypto_shash_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+	if ((unsigned long)data & alignmask)
+		return shash_update_unaligned(desc, data, len);
+
+	return shash->update(desc, data, len);
+}
+EXPORT_SYMBOL_GPL(crypto_shash_update);
+
+static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned int ds = crypto_shash_digestsize(tfm);
+	u8 buf[shash_align_buffer_size(ds, alignmask)]
+		__attribute__ ((aligned));
+	int err;
+
+	err = shash->final(desc, buf);
+	memcpy(out, buf, ds);
+	return err;
+}
+
+int crypto_shash_final(struct shash_desc *desc, u8 *out)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+	if ((unsigned long)out & alignmask)
+		return shash_final_unaligned(desc, out);
+
+	return shash->final(desc, out);
+}
+EXPORT_SYMBOL_GPL(crypto_shash_final);
+
+static int shash_finup_unaligned(struct shash_desc *desc, const u8 *data,
+				 unsigned int len, u8 *out)
+{
+	return crypto_shash_update(desc, data, len) ?:
+	       crypto_shash_final(desc, out);
+}
+
+int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
+		       unsigned int len, u8 *out)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+	if (((unsigned long)data | (unsigned long)out) & alignmask ||
+	    !shash->finup)
+		return shash_finup_unaligned(desc, data, len, out);
+
+	return shash->finup(desc, data, len, out);
+}
+EXPORT_SYMBOL_GPL(crypto_shash_finup);
+
+static int shash_digest_unaligned(struct shash_desc *desc, const u8 *data,
+				  unsigned int len, u8 *out)
+{
+	return crypto_shash_init(desc) ?:
+	       crypto_shash_update(desc, data, len) ?:
+	       crypto_shash_final(desc, out);
+}
+
+int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	struct shash_alg *shash = crypto_shash_alg(tfm);
+	unsigned long alignmask = crypto_shash_alignmask(tfm);
+
+	if (((unsigned long)data | (unsigned long)out) & alignmask ||
+	    !shash->digest)
+		return shash_digest_unaligned(desc, data, len, out);
+
+	return shash->digest(desc, data, len, out);
+}
+EXPORT_SYMBOL_GPL(crypto_shash_digest);
+
+static int crypto_shash_init_tfm(struct crypto_tfm *tfm,
+				 const struct crypto_type *frontend)
+{
+	if (frontend->type != CRYPTO_ALG_TYPE_SHASH)
+		return -EINVAL;
+	return 0;
+}
+
+static unsigned int crypto_shash_extsize(struct crypto_alg *alg,
+					 const struct crypto_type *frontend)
+{
+	return alg->cra_ctxsize;
+}
+
+static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
+	__attribute__ ((unused));
+static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
+{
+	struct shash_alg *salg = __crypto_shash_alg(alg);
+
+	seq_printf(m, "type         : shash\n");
+	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
+	seq_printf(m, "digestsize   : %u\n", salg->digestsize);
+	seq_printf(m, "descsize     : %u\n", salg->descsize);
+}
+
+static const struct crypto_type crypto_shash_type = {
+	.extsize = crypto_shash_extsize,
+	.init_tfm = crypto_shash_init_tfm,
+#ifdef CONFIG_PROC_FS
+	.show = crypto_shash_show,
+#endif
+	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
+	.maskset = CRYPTO_ALG_TYPE_MASK,
+	.type = CRYPTO_ALG_TYPE_SHASH,
+	.tfmsize = offsetof(struct crypto_shash, base),
+};
+
+struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
+					u32 mask)
+{
+	return __crypto_shash_cast(
+		crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask));
+}
+EXPORT_SYMBOL_GPL(crypto_alloc_shash);
+
+int crypto_register_shash(struct shash_alg *alg)
+{
+	struct crypto_alg *base = &alg->base;
+
+	if (alg->digestsize > PAGE_SIZE / 8 ||
+	    alg->descsize > PAGE_SIZE / 8)
+		return -EINVAL;
+
+	base->cra_type = &crypto_shash_type;
+	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+	base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;
+
+	return crypto_register_alg(base);
+}
+EXPORT_SYMBOL_GPL(crypto_register_shash);
+
+int crypto_unregister_shash(struct shash_alg *alg)
+{
+	return crypto_unregister_alg(&alg->base);
+}
+EXPORT_SYMBOL_GPL(crypto_unregister_shash);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Synchronous cryptographic hash type");
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index ee48ef8fb2e..f9b51d40895 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -15,10 +15,39 @@
 
 #include <linux/crypto.h>
 
+struct shash_desc {
+	struct crypto_shash *tfm;
+	u32 flags;
+
+	void *__ctx[] CRYPTO_MINALIGN_ATTR;
+};
+
+struct shash_alg {
+	int (*init)(struct shash_desc *desc);
+	int (*update)(struct shash_desc *desc, const u8 *data,
+		      unsigned int len);
+	int (*final)(struct shash_desc *desc, u8 *out);
+	int (*finup)(struct shash_desc *desc, const u8 *data,
+		     unsigned int len, u8 *out);
+	int (*digest)(struct shash_desc *desc, const u8 *data,
+		      unsigned int len, u8 *out);
+	int (*setkey)(struct crypto_shash *tfm, const u8 *key,
+		      unsigned int keylen);
+
+	unsigned int descsize;
+	unsigned int digestsize;
+
+	struct crypto_alg base;
+};
+
 struct crypto_ahash {
 	struct crypto_tfm base;
 };
 
+struct crypto_shash {
+	struct crypto_tfm base;
+};
+
 static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm)
 {
 	return (struct crypto_ahash *)tfm;
@@ -169,4 +198,79 @@ static inline void ahash_request_set_crypt(struct ahash_request *req,
 	req->result = result;
 }
 
+struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
+					u32 mask);
+
+static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
+{
+	return &tfm->base;
+}
+
+static inline void crypto_free_shash(struct crypto_shash *tfm)
+{
+	crypto_free_tfm(crypto_shash_tfm(tfm));
+}
+
+static inline unsigned int crypto_shash_alignmask(
+	struct crypto_shash *tfm)
+{
+	return crypto_tfm_alg_alignmask(crypto_shash_tfm(tfm));
+}
+
+static inline struct shash_alg *__crypto_shash_alg(struct crypto_alg *alg)
+{
+	return container_of(alg, struct shash_alg, base);
+}
+
+static inline struct shash_alg *crypto_shash_alg(struct crypto_shash *tfm)
+{
+	return __crypto_shash_alg(crypto_shash_tfm(tfm)->__crt_alg);
+}
+
+static inline unsigned int crypto_shash_digestsize(struct crypto_shash *tfm)
+{
+	return crypto_shash_alg(tfm)->digestsize;
+}
+
+static inline u32 crypto_shash_get_flags(struct crypto_shash *tfm)
+{
+	return crypto_tfm_get_flags(crypto_shash_tfm(tfm));
+}
+
+static inline void crypto_shash_set_flags(struct crypto_shash *tfm, u32 flags)
+{
+	crypto_tfm_set_flags(crypto_shash_tfm(tfm), flags);
+}
+
+static inline void crypto_shash_clear_flags(struct crypto_shash *tfm, u32 flags)
+{
+	crypto_tfm_clear_flags(crypto_shash_tfm(tfm), flags);
+}
+
+static inline unsigned int crypto_shash_descsize(struct crypto_shash *tfm)
+{
+	return crypto_shash_alg(tfm)->descsize;
+}
+
+static inline void *shash_desc_ctx(struct shash_desc *desc)
+{
+	return desc->__ctx;
+}
+
+int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
+			unsigned int keylen);
+int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out);
+
+static inline int crypto_shash_init(struct shash_desc *desc)
+{
+	return crypto_shash_alg(desc->tfm)->init(desc);
+}
+
+int crypto_shash_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len);
+int crypto_shash_final(struct shash_desc *desc, u8 *out);
+int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
+		       unsigned int len, u8 *out);
+
 #endif	/* _CRYPTO_HASH_H */
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 917ae57bad4..32d3a8ed06d 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -40,6 +40,9 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err);
 int crypto_hash_walk_first(struct ahash_request *req,
 			   struct crypto_hash_walk *walk);
 
+int crypto_register_shash(struct shash_alg *alg);
+int crypto_unregister_shash(struct shash_alg *alg);
+
 static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm)
 {
 	return crypto_tfm_ctx(&tfm->base);
@@ -74,5 +77,10 @@ static inline int ahash_tfm_in_queue(struct crypto_queue *queue,
 	return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm));
 }
 
+static inline void *crypto_shash_ctx(struct crypto_shash *tfm)
+{
+	return crypto_tfm_ctx(&tfm->base);
+}
+
 #endif	/* _CRYPTO_INTERNAL_HASH_H */
 
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ffaaa418cf5..ee95c748695 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -39,6 +39,7 @@
 #define CRYPTO_ALG_TYPE_HASH		0x00000009
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000a
 #define CRYPTO_ALG_TYPE_RNG		0x0000000c
+#define CRYPTO_ALG_TYPE_SHASH		0x0000000d
 
 #define CRYPTO_ALG_TYPE_HASH_MASK	0x0000000e
 #define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000c
-- 
cgit v1.2.3-70-g09d2


From 3b2f6df08258e2875f42bd630eece7e7241a053b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 31 Aug 2008 18:52:18 +1000
Subject: crypto: hash - Export shash through ahash

This patch allows shash algorithms to be used through the ahash
interface.  This is required before we can convert digest algorithms
over to shash.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/shash.c         | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/crypto.h |   2 +-
 2 files changed, 144 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/crypto/shash.c b/crypto/shash.c
index 82ec4bd8d2f..3f4c713a21e 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <crypto/scatterwalk.h>
 #include <crypto/internal/hash.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
@@ -17,11 +18,15 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 
+static const struct crypto_type crypto_shash_type;
+
 static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm)
 {
 	return container_of(tfm, struct crypto_shash, base);
 }
 
+#include "internal.h"
+
 static int shash_setkey_unaligned(struct crypto_shash *tfm, const u8 *key,
 				  unsigned int keylen)
 {
@@ -167,6 +172,142 @@ int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
 }
 EXPORT_SYMBOL_GPL(crypto_shash_digest);
 
+static int shash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct crypto_shash **ctx = crypto_ahash_ctx(tfm);
+
+	return crypto_shash_setkey(*ctx, key, keylen);
+}
+
+static int shash_async_init(struct ahash_request *req)
+{
+	struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct shash_desc *desc = ahash_request_ctx(req);
+
+	desc->tfm = *ctx;
+	desc->flags = req->base.flags;
+
+	return crypto_shash_init(desc);
+}
+
+static int shash_async_update(struct ahash_request *req)
+{
+	struct shash_desc *desc = ahash_request_ctx(req);
+	struct crypto_hash_walk walk;
+	int nbytes;
+
+	for (nbytes = crypto_hash_walk_first(req, &walk); nbytes > 0;
+	     nbytes = crypto_hash_walk_done(&walk, nbytes))
+		nbytes = crypto_shash_update(desc, walk.data, nbytes);
+
+	return nbytes;
+}
+
+static int shash_async_final(struct ahash_request *req)
+{
+	return crypto_shash_final(ahash_request_ctx(req), req->result);
+}
+
+static int shash_async_digest(struct ahash_request *req)
+{
+	struct scatterlist *sg = req->src;
+	unsigned int offset = sg->offset;
+	unsigned int nbytes = req->nbytes;
+	int err;
+
+	if (nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset)) {
+		struct crypto_shash **ctx =
+			crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+		struct shash_desc *desc = ahash_request_ctx(req);
+		void *data;
+
+		desc->tfm = *ctx;
+		desc->flags = req->base.flags;
+
+		data = crypto_kmap(sg_page(sg), 0);
+		err = crypto_shash_digest(desc, data + offset, nbytes,
+					  req->result);
+		crypto_kunmap(data, 0);
+		crypto_yield(desc->flags);
+		goto out;
+	}
+
+	err = shash_async_init(req);
+	if (err)
+		goto out;
+
+	err = shash_async_update(req);
+	if (err)
+		goto out;
+
+	err = shash_async_final(req);
+
+out:
+	return err;
+}
+
+static void crypto_exit_shash_ops_async(struct crypto_tfm *tfm)
+{
+	struct crypto_shash **ctx = crypto_tfm_ctx(tfm);
+
+	crypto_free_shash(*ctx);
+}
+
+static int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
+{
+	struct crypto_alg *calg = tfm->__crt_alg;
+	struct shash_alg *alg = __crypto_shash_alg(calg);
+	struct ahash_tfm *crt = &tfm->crt_ahash;
+	struct crypto_shash **ctx = crypto_tfm_ctx(tfm);
+	struct crypto_shash *shash;
+
+	if (!crypto_mod_get(calg))
+		return -EAGAIN;
+
+	shash = __crypto_shash_cast(crypto_create_tfm(
+		calg, &crypto_shash_type));
+	if (IS_ERR(shash)) {
+		crypto_mod_put(calg);
+		return PTR_ERR(shash);
+	}
+
+	*ctx = shash;
+	tfm->exit = crypto_exit_shash_ops_async;
+
+	crt->init = shash_async_init;
+	crt->update = shash_async_update;
+	crt->final  = shash_async_final;
+	crt->digest = shash_async_digest;
+	crt->setkey = shash_async_setkey;
+
+	crt->digestsize = alg->digestsize;
+	crt->reqsize = sizeof(struct shash_desc) + crypto_shash_descsize(shash);
+
+	return 0;
+}
+
+static int crypto_init_shash_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
+{
+	switch (mask & CRYPTO_ALG_TYPE_MASK) {
+	case CRYPTO_ALG_TYPE_AHASH_MASK:
+		return crypto_init_shash_ops_async(tfm);
+	}
+
+	return -EINVAL;
+}
+
+static unsigned int crypto_shash_ctxsize(struct crypto_alg *alg, u32 type,
+					 u32 mask)
+{
+	switch (mask & CRYPTO_ALG_TYPE_MASK) {
+	case CRYPTO_ALG_TYPE_AHASH_MASK:
+		return sizeof(struct crypto_shash *);
+	}
+
+	return 0;
+}
+
 static int crypto_shash_init_tfm(struct crypto_tfm *tfm,
 				 const struct crypto_type *frontend)
 {
@@ -194,7 +335,9 @@ static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 }
 
 static const struct crypto_type crypto_shash_type = {
+	.ctxsize = crypto_shash_ctxsize,
 	.extsize = crypto_shash_extsize,
+	.init = crypto_init_shash_ops,
 	.init_tfm = crypto_shash_init_tfm,
 #ifdef CONFIG_PROC_FS
 	.show = crypto_shash_show,
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ee95c748695..44c72f0f9b0 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -38,8 +38,8 @@
 #define CRYPTO_ALG_TYPE_DIGEST		0x00000008
 #define CRYPTO_ALG_TYPE_HASH		0x00000009
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000a
+#define CRYPTO_ALG_TYPE_SHASH		0x0000000b
 #define CRYPTO_ALG_TYPE_RNG		0x0000000c
-#define CRYPTO_ALG_TYPE_SHASH		0x0000000d
 
 #define CRYPTO_ALG_TYPE_HASH_MASK	0x0000000e
 #define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000c
-- 
cgit v1.2.3-70-g09d2


From dec8b78606ebd5f309c38f2fb10196ce996dd18d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 2 Nov 2008 21:38:11 +0800
Subject: crypto: hash - Add import/export interface

It is often useful to save the partial state of a hash function
so that it can be used as a base for two or more computations.

The most prominent example is HMAC where all hashes start from
a base determined by the key.  Having an import/export interface
means that we only have to compute that base once rather than
for each message.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c                 | 14 ++++++++++++++
 crypto/shash.c                 | 14 ++++++++++++++
 include/crypto/hash.h          | 21 +++++++++++++++++++++
 include/crypto/internal/hash.h |  5 -----
 include/linux/crypto.h         |  1 +
 5 files changed, 50 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 27128f2c687..7d4e33dfe21 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -146,6 +146,20 @@ static int ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
 	return ahash->setkey(tfm, key, keylen);
 }
 
+int crypto_ahash_import(struct ahash_request *req, const u8 *in)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct ahash_alg *alg = crypto_ahash_alg(tfm);
+
+	memcpy(ahash_request_ctx(req), in, crypto_ahash_reqsize(tfm));
+
+	if (alg->reinit)
+		alg->reinit(req);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_ahash_import);
+
 static unsigned int crypto_ahash_ctxsize(struct crypto_alg *alg, u32 type,
 					u32 mask)
 {
diff --git a/crypto/shash.c b/crypto/shash.c
index 3f4c713a21e..26aff3feefc 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -172,6 +172,20 @@ int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
 }
 EXPORT_SYMBOL_GPL(crypto_shash_digest);
 
+int crypto_shash_import(struct shash_desc *desc, const u8 *in)
+{
+	struct crypto_shash *tfm = desc->tfm;
+	struct shash_alg *alg = crypto_shash_alg(tfm);
+
+	memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(tfm));
+
+	if (alg->reinit)
+		alg->reinit(desc);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_shash_import);
+
 static int shash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
 			      unsigned int keylen)
 {
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index f9b51d40895..cd16d6e668c 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -24,6 +24,7 @@ struct shash_desc {
 
 struct shash_alg {
 	int (*init)(struct shash_desc *desc);
+	int (*reinit)(struct shash_desc *desc);
 	int (*update)(struct shash_desc *desc, const u8 *data,
 		      unsigned int len);
 	int (*final)(struct shash_desc *desc, u8 *out);
@@ -116,6 +117,11 @@ static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm)
 	return crypto_ahash_crt(tfm)->reqsize;
 }
 
+static inline void *ahash_request_ctx(struct ahash_request *req)
+{
+	return req->__ctx;
+}
+
 static inline int crypto_ahash_setkey(struct crypto_ahash *tfm,
 				      const u8 *key, unsigned int keylen)
 {
@@ -130,6 +136,14 @@ static inline int crypto_ahash_digest(struct ahash_request *req)
 	return crt->digest(req);
 }
 
+static inline void crypto_ahash_export(struct ahash_request *req, u8 *out)
+{
+	memcpy(out, ahash_request_ctx(req),
+	       crypto_ahash_reqsize(crypto_ahash_reqtfm(req)));
+}
+
+int crypto_ahash_import(struct ahash_request *req, const u8 *in);
+
 static inline int crypto_ahash_init(struct ahash_request *req)
 {
 	struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req));
@@ -262,6 +276,13 @@ int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
 int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
 			unsigned int len, u8 *out);
 
+static inline void crypto_shash_export(struct shash_desc *desc, u8 *out)
+{
+	memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm));
+}
+
+int crypto_shash_import(struct shash_desc *desc, const u8 *in);
+
 static inline int crypto_shash_init(struct shash_desc *desc)
 {
 	return crypto_shash_alg(desc->tfm)->init(desc);
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 32d3a8ed06d..92fbe738585 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -66,11 +66,6 @@ static inline struct ahash_request *ahash_dequeue_request(
 	return ahash_request_cast(crypto_dequeue_request(queue));
 }
 
-static inline void *ahash_request_ctx(struct ahash_request *req)
-{
-	return req->__ctx;
-}
-
 static inline int ahash_tfm_in_queue(struct crypto_queue *queue,
 					  struct crypto_ahash *tfm)
 {
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 44c72f0f9b0..77a1f3d9416 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -221,6 +221,7 @@ struct ablkcipher_alg {
 
 struct ahash_alg {
 	int (*init)(struct ahash_request *req);
+	int (*reinit)(struct ahash_request *req);
 	int (*update)(struct ahash_request *req);
 	int (*final)(struct ahash_request *req);
 	int (*digest)(struct ahash_request *req);
-- 
cgit v1.2.3-70-g09d2


From 5f7082ed4f482f05db01d84dbf58190492ebf0ad Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 31 Aug 2008 22:21:09 +1000
Subject: crypto: hash - Export shash through hash

This patch allows shash algorithms to be used through the old hash
interface.  This is a transitional measure so we can convert the
underlying algorithms to shash before converting the users across.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c                 |  16 ++++++
 crypto/authenc.c               |   3 ++
 crypto/hmac.c                  |  10 ++--
 crypto/shash.c                 | 109 +++++++++++++++++++++++++++++++++++++++++
 include/crypto/algapi.h        |   5 ++
 include/crypto/internal/hash.h |   3 ++
 include/linux/crypto.h         |   4 +-
 7 files changed, 144 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 7d4e33dfe21..9f98956b17f 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -112,6 +112,22 @@ int crypto_hash_walk_first(struct ahash_request *req,
 }
 EXPORT_SYMBOL_GPL(crypto_hash_walk_first);
 
+int crypto_hash_walk_first_compat(struct hash_desc *hdesc,
+				  struct crypto_hash_walk *walk,
+				  struct scatterlist *sg, unsigned int len)
+{
+	walk->total = len;
+
+	if (!walk->total)
+		return 0;
+
+	walk->alignmask = crypto_hash_alignmask(hdesc->tfm);
+	walk->sg = sg;
+	walk->flags = hdesc->flags;
+
+	return hash_walk_new_entry(walk);
+}
+
 static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key,
 				unsigned int keylen)
 {
diff --git a/crypto/authenc.c b/crypto/authenc.c
index fd9f06c63d7..40b6e9ec9e3 100644
--- a/crypto/authenc.c
+++ b/crypto/authenc.c
@@ -11,6 +11,7 @@
  */
 
 #include <crypto/aead.h>
+#include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/authenc.h>
 #include <crypto/scatterwalk.h>
@@ -431,6 +432,8 @@ static struct crypto_instance *crypto_authenc_alloc(struct rtattr **tb)
 	inst->alg.cra_aead.ivsize = enc->cra_ablkcipher.ivsize;
 	inst->alg.cra_aead.maxauthsize = auth->cra_type == &crypto_hash_type ?
 					 auth->cra_hash.digestsize :
+					 auth->cra_type ?
+					 __crypto_shash_alg(auth)->digestsize :
 					 auth->cra_digest.dia_digestsize;
 
 	inst->alg.cra_ctxsize = sizeof(struct crypto_authenc_ctx);
diff --git a/crypto/hmac.c b/crypto/hmac.c
index 7ff2d6a8c7d..0ad39c37496 100644
--- a/crypto/hmac.c
+++ b/crypto/hmac.c
@@ -16,7 +16,7 @@
  *
  */
 
-#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
 #include <crypto/scatterwalk.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -238,9 +238,11 @@ static struct crypto_instance *hmac_alloc(struct rtattr **tb)
 		return ERR_CAST(alg);
 
 	inst = ERR_PTR(-EINVAL);
-	ds = (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) ==
-	     CRYPTO_ALG_TYPE_HASH ? alg->cra_hash.digestsize :
-				    alg->cra_digest.dia_digestsize;
+	ds = alg->cra_type == &crypto_hash_type ?
+	     alg->cra_hash.digestsize :
+	     alg->cra_type ?
+	     __crypto_shash_alg(alg)->digestsize :
+	     alg->cra_digest.dia_digestsize;
 	if (ds > alg->cra_blocksize)
 		goto out_put_alg;
 
diff --git a/crypto/shash.c b/crypto/shash.c
index 26aff3feefc..50d69a4e4b6 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -301,9 +301,114 @@ static int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
 	return 0;
 }
 
+static int shash_compat_setkey(struct crypto_hash *tfm, const u8 *key,
+			       unsigned int keylen)
+{
+	struct shash_desc *desc = crypto_hash_ctx(tfm);
+
+	return crypto_shash_setkey(desc->tfm, key, keylen);
+}
+
+static int shash_compat_init(struct hash_desc *hdesc)
+{
+	struct shash_desc *desc = crypto_hash_ctx(hdesc->tfm);
+
+	desc->flags = hdesc->flags;
+
+	return crypto_shash_init(desc);
+}
+
+static int shash_compat_update(struct hash_desc *hdesc, struct scatterlist *sg,
+			       unsigned int len)
+{
+	struct shash_desc *desc = crypto_hash_ctx(hdesc->tfm);
+	struct crypto_hash_walk walk;
+	int nbytes;
+
+	for (nbytes = crypto_hash_walk_first_compat(hdesc, &walk, sg, len);
+	     nbytes > 0; nbytes = crypto_hash_walk_done(&walk, nbytes))
+		nbytes = crypto_shash_update(desc, walk.data, nbytes);
+
+	return nbytes;
+}
+
+static int shash_compat_final(struct hash_desc *hdesc, u8 *out)
+{
+	return crypto_shash_final(crypto_hash_ctx(hdesc->tfm), out);
+}
+
+static int shash_compat_digest(struct hash_desc *hdesc, struct scatterlist *sg,
+			       unsigned int nbytes, u8 *out)
+{
+	unsigned int offset = sg->offset;
+	int err;
+
+	if (nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset)) {
+		struct shash_desc *desc = crypto_hash_ctx(hdesc->tfm);
+		void *data;
+
+		desc->flags = hdesc->flags;
+
+		data = crypto_kmap(sg_page(sg), 0);
+		err = crypto_shash_digest(desc, data + offset, nbytes, out);
+		crypto_kunmap(data, 0);
+		crypto_yield(desc->flags);
+		goto out;
+	}
+
+	err = shash_compat_init(hdesc);
+	if (err)
+		goto out;
+
+	err = shash_compat_update(hdesc, sg, nbytes);
+	if (err)
+		goto out;
+
+	err = shash_compat_final(hdesc, out);
+
+out:
+	return err;
+}
+
+static void crypto_exit_shash_ops_compat(struct crypto_tfm *tfm)
+{
+	struct shash_desc *desc= crypto_tfm_ctx(tfm);
+
+	crypto_free_shash(desc->tfm);
+}
+
+static int crypto_init_shash_ops_compat(struct crypto_tfm *tfm)
+{
+	struct hash_tfm *crt = &tfm->crt_hash;
+	struct crypto_alg *calg = tfm->__crt_alg;
+	struct shash_alg *alg = __crypto_shash_alg(calg);
+	struct shash_desc *desc = crypto_tfm_ctx(tfm);
+	struct crypto_shash *shash;
+
+	shash = __crypto_shash_cast(crypto_create_tfm(
+		calg, &crypto_shash_type));
+	if (IS_ERR(shash))
+		return PTR_ERR(shash);
+
+	desc->tfm = shash;
+	tfm->exit = crypto_exit_shash_ops_compat;
+
+	crt->init = shash_compat_init;
+	crt->update = shash_compat_update;
+	crt->final  = shash_compat_final;
+	crt->digest = shash_compat_digest;
+	crt->setkey = shash_compat_setkey;
+
+	crt->digestsize = alg->digestsize;
+
+	return 0;
+}
+
 static int crypto_init_shash_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
 {
 	switch (mask & CRYPTO_ALG_TYPE_MASK) {
+	case CRYPTO_ALG_TYPE_HASH_MASK:
+		return crypto_init_shash_ops_compat(tfm);
 	case CRYPTO_ALG_TYPE_AHASH_MASK:
 		return crypto_init_shash_ops_async(tfm);
 	}
@@ -314,7 +419,11 @@ static int crypto_init_shash_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
 static unsigned int crypto_shash_ctxsize(struct crypto_alg *alg, u32 type,
 					 u32 mask)
 {
+	struct shash_alg *salg = __crypto_shash_alg(alg);
+
 	switch (mask & CRYPTO_ALG_TYPE_MASK) {
+	case CRYPTO_ALG_TYPE_HASH_MASK:
+		return sizeof(struct shash_desc) + salg->descsize;
 	case CRYPTO_ALG_TYPE_AHASH_MASK:
 		return sizeof(struct crypto_shash *);
 	}
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 986db68548f..010545436ef 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -248,6 +248,11 @@ static inline struct crypto_hash *crypto_spawn_hash(struct crypto_spawn *spawn)
 	return __crypto_hash_cast(crypto_spawn_tfm(spawn, type, mask));
 }
 
+static inline void *crypto_hash_ctx(struct crypto_hash *tfm)
+{
+	return crypto_tfm_ctx(&tfm->base);
+}
+
 static inline void *crypto_hash_ctx_aligned(struct crypto_hash *tfm)
 {
 	return crypto_tfm_ctx_aligned(&tfm->base);
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 92fbe738585..82b70564bca 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -39,6 +39,9 @@ extern const struct crypto_type crypto_ahash_type;
 int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err);
 int crypto_hash_walk_first(struct ahash_request *req,
 			   struct crypto_hash_walk *walk);
+int crypto_hash_walk_first_compat(struct hash_desc *hdesc,
+				  struct crypto_hash_walk *walk,
+				  struct scatterlist *sg, unsigned int len);
 
 int crypto_register_shash(struct shash_alg *alg);
 int crypto_unregister_shash(struct shash_alg *alg);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 77a1f3d9416..3bacd71509f 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -36,9 +36,9 @@
 #define CRYPTO_ALG_TYPE_ABLKCIPHER	0x00000005
 #define CRYPTO_ALG_TYPE_GIVCIPHER	0x00000006
 #define CRYPTO_ALG_TYPE_DIGEST		0x00000008
-#define CRYPTO_ALG_TYPE_HASH		0x00000009
+#define CRYPTO_ALG_TYPE_HASH		0x00000008
+#define CRYPTO_ALG_TYPE_SHASH		0x00000009
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000a
-#define CRYPTO_ALG_TYPE_SHASH		0x0000000b
 #define CRYPTO_ALG_TYPE_RNG		0x0000000c
 
 #define CRYPTO_ALG_TYPE_HASH_MASK	0x0000000e
-- 
cgit v1.2.3-70-g09d2


From 69c35efcf1576ab5f00cba83e8ca740923afb6c9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 7 Nov 2008 15:11:47 +0800
Subject: libcrc32c: Move implementation to crypto crc32c

This patch swaps the role of libcrc32c and crc32c.  Previously
the implementation was in libcrc32c and crc32c was a wrapper.
Now the code is in crc32c and libcrc32c just calls the crypto
layer.

The reason for the change is to tap into the algorithm selection
capability of the crypto API so that optimised implementations
such as the one utilising Intel's CRC32C instruction can be
used where available.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig         |   4 +-
 crypto/crc32c.c        | 113 +++++++++++++++++++++++++++++-
 include/linux/crc32c.h |   5 +-
 lib/Kconfig            |   1 +
 lib/libcrc32c.c        | 182 +++++++++----------------------------------------
 5 files changed, 146 insertions(+), 159 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index dc20a34ba5e..aede80246df 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -256,12 +256,10 @@ comment "Digest"
 config CRYPTO_CRC32C
 	tristate "CRC32c CRC algorithm"
 	select CRYPTO_HASH
-	select LIBCRC32C
 	help
 	  Castagnoli, et al Cyclic Redundancy-Check Algorithm.  Used
 	  by iSCSI for header and data digests and by others.
-	  See Castagnoli93.  This implementation uses lib/libcrc32c.
-	  Module will be crc32c.
+	  See Castagnoli93.  Module will be crc32c.
 
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
diff --git a/crypto/crc32c.c b/crypto/crc32c.c
index b21b93f2bb9..973bc2cfab2 100644
--- a/crypto/crc32c.c
+++ b/crypto/crc32c.c
@@ -3,8 +3,29 @@
  *
  * CRC32C chksum
  *
- * This module file is a wrapper to invoke the lib/crc32c routines.
+ *@Article{castagnoli-crc,
+ * author =       { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
+ * title =        {{Optimization of Cyclic Redundancy-Check Codes with 24
+ *                 and 32 Parity Bits}},
+ * journal =      IEEE Transactions on Communication,
+ * year =         {1993},
+ * volume =       {41},
+ * number =       {6},
+ * pages =        {},
+ * month =        {June},
+ *}
+ * Used by the iSCSI driver, possibly others, and derived from the
+ * the iscsi-crc.c module of the linux-iscsi driver at
+ * http://linux-iscsi.sourceforge.net.
  *
+ * Following the example of lib/crc32, this function is intended to be
+ * flexible and useful for all users.  Modules that currently have their
+ * own crc32c, but hopefully may be able to use this one are:
+ *  net/sctp (please add all your doco to here if you change to
+ *            use this one!)
+ *  <endoflist>
+ *
+ * Copyright (c) 2004 Cisco Systems, Inc.
  * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
@@ -18,7 +39,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
-#include <linux/crc32c.h>
 #include <linux/kernel.h>
 
 #define CHKSUM_BLOCK_SIZE	1
@@ -32,6 +52,95 @@ struct chksum_desc_ctx {
 	u32 crc;
 };
 
+/*
+ * This is the CRC-32C table
+ * Generated with:
+ * width = 32 bits
+ * poly = 0x1EDC6F41
+ * reflect input bytes = true
+ * reflect output bytes = true
+ */
+
+static const u32 crc32c_table[256] = {
+	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Steps through buffer one byte at at time, calculates reflected
+ * crc using table.
+ */
+
+static u32 crc32c(u32 crc, const u8 *data, unsigned int length)
+{
+	while (length--)
+		crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+
+	return crc;
+}
+
 /*
  * Steps through buffer one byte at at time, calculates reflected 
  * crc using table.
diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index 508f512e5a2..66fa8ff795e 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -3,9 +3,6 @@
 
 #include <linux/types.h>
 
-extern u32 crc32c_le(u32 crc, unsigned char const *address, size_t length);
-extern u32 crc32c_be(u32 crc, unsigned char const *address, size_t length);
-
-#define crc32c(seed, data, length)  crc32c_le(seed, (unsigned char const *)data, length)
+extern u32 crc32c(u32 crc, const void *address, unsigned int length);
 
 #endif	/* _LINUX_CRC32C_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 85cf7ea978a..ce303f13ed9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -64,6 +64,7 @@ config CRC7
 
 config LIBCRC32C
 	tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check"
+	select CRYPTO_CRC32C
 	help
 	  This option is provided for the case where no in-kernel-tree
 	  modules require CRC32c functions, but a module built outside the
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index b5c3287d8ea..38b17ab52ff 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -30,168 +30,50 @@
  * any later version.
  *
  */
-#include <linux/crc32c.h>
-#include <linux/compiler.h>
-#include <linux/module.h>
-
-MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
-MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
-MODULE_LICENSE("GPL");
-
-#define CRC32C_POLY_BE 0x1EDC6F41
-#define CRC32C_POLY_LE 0x82F63B78
 
-#ifndef CRC_LE_BITS 
-# define CRC_LE_BITS 8
-#endif
-
-
-/*
- * Haven't generated a big-endian table yet, but the bit-wise version
- * should at least work.
- */
-#if defined CRC_BE_BITS && CRC_BE_BITS != 1
-#undef CRC_BE_BITS
-#endif
-#ifndef CRC_BE_BITS
-# define CRC_BE_BITS 1
-#endif
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 
-EXPORT_SYMBOL(crc32c_le);
+static struct crypto_shash *tfm;
 
-#if CRC_LE_BITS == 1
-/*
- * Compute things bit-wise, as done in crc32.c.  We could share the tight 
- * loop below with crc32 and vary the POLY if we don't find value in terms
- * of space and maintainability in keeping the two modules separate.
- */
-u32 __pure
-crc32c_le(u32 crc, unsigned char const *p, size_t len)
+u32 crc32c(u32 crc, const void *address, unsigned int length)
 {
-	int i;
-	while (len--) {
-		crc ^= *p++;
-		for (i = 0; i < 8; i++)
-			crc = (crc >> 1) ^ ((crc & 1) ? CRC32C_POLY_LE : 0);
-	}
-	return crc;
-}
-#else
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(tfm)];
+	} desc;
+	int err;
 
-/*
- * This is the CRC-32C table
- * Generated with:
- * width = 32 bits
- * poly = 0x1EDC6F41
- * reflect input bytes = true
- * reflect output bytes = true
- */
+	desc.shash.tfm = tfm;
+	desc.shash.flags = 0;
+	*(u32 *)desc.ctx = crc;
 
-static const u32 crc32c_table[256] = {
-	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
-	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
-	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
-	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
-	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
-	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
-	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
-	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
-	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
-	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
-	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
-	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
-	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
-	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
-	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
-	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
-	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
-	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
-	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
-	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
-	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
-	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
-	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
-	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
-	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
-	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
-	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
-	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
-	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
-	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
-	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
-	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
-	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
-	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
-	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
-	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
-	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
-	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
-	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
-	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
-	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
-	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
-	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
-	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
-	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
-	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
-	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
-	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
-	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
-	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
-	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
-	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
-	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
-	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
-	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
-	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
-	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
-	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
-	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
-	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
-	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
-	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
-	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
-	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
-};
+	err = crypto_shash_update(&desc.shash, address, length);
+	BUG_ON(err);
 
-/*
- * Steps through buffer one byte at at time, calculates reflected 
- * crc using table.
- */
+	return *(u32 *)desc.ctx;
+}
 
-u32 __pure
-crc32c_le(u32 crc, unsigned char const *data, size_t length)
+static int __init libcrc32c_mod_init(void)
 {
-	while (length--)
-		crc =
-		    crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+	tfm = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
 
-	return crc;
+	return 0;
 }
 
-#endif	/* CRC_LE_BITS == 8 */
-
-EXPORT_SYMBOL(crc32c_be);
-
-#if CRC_BE_BITS == 1
-u32 __pure
-crc32c_be(u32 crc, unsigned char const *p, size_t len)
+static void __exit libcrc32c_mod_fini(void)
 {
-	int i;
-	while (len--) {
-		crc ^= *p++ << 24;
-		for (i = 0; i < 8; i++)
-			crc =
-			    (crc << 1) ^ ((crc & 0x80000000) ? CRC32C_POLY_BE :
-					  0);
-	}
-	return crc;
+	crypto_free_shash(tfm);
 }
-#endif
 
-/*
- * Unit test
- *
- * A small unit test suite is implemented as part of the crypto suite.
- * Select CRYPTO_CRC32C and use the tcrypt module to run the tests.
- */
+module_init(libcrc32c_mod_init);
+module_exit(libcrc32c_mod_fini);
+
+MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 0426c166424ea6d3d0412f47879c8ba268f874c4 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 11 Nov 2008 12:20:06 +0800
Subject: libcrc32c: Add crc32c_le macro

The bnx2x driver actually uses the crc32c_le name so this patch
restores the crc32c_le symbol through a macro.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crc32c.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h
index 66fa8ff795e..bd8b44d96bd 100644
--- a/include/linux/crc32c.h
+++ b/include/linux/crc32c.h
@@ -5,4 +5,7 @@
 
 extern u32 crc32c(u32 crc, const void *address, unsigned int length);
 
+/* This macro exists for backwards-compatibility. */
+#define crc32c_le crc32c
+
 #endif	/* _LINUX_CRC32C_H */
-- 
cgit v1.2.3-70-g09d2


From f9af0e70911e9d6cc9a68f784dca86415486084d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 12:24:24 +0900
Subject: irq: for_each_irq_desc() move to irqnr.h

Impact: cleanup

before CONFIG_SPARSE_IRQ age, for_each_irq_desc() sat in irqnr.h and
could be called from generic code.

CONFIG_SPARSE_IRQ breaks this assumption, but SPARSE_IRQ version
for_each_irq_desc() also can move into irqnr.h easily.

Also, this patch unifies CONFIG_SPARSE_IRQ and !CONFIG_SPARSE_IRQ
for_each_irq_desc().

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h   | 24 ++++--------------------
 include/linux/irqnr.h | 19 +++++++++----------
 kernel/irq/handle.c   | 13 +++++++++++--
 3 files changed, 24 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 98564dc6447..69da275c0eb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -202,33 +202,17 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc
 
 #ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
-
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
-	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
-}
-static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
-{
-	return irq_to_desc(irq);
-}
-
-#else
-
-extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+#else /* CONFIG_SPARSE_IRQ */
 extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
 
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
-# define for_each_irq_desc_reverse(irq, desc)                          \
-	for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
-
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
 #define kstat_incr_irqs_this_cpu(irqno, DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()]++)
 
-#endif
+#endif /* CONFIG_SPARSE_IRQ */
+
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
 
 static inline struct irq_desc *
 irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 95d2b74641f..c4a59c7a478 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -15,20 +15,19 @@
 
 # define for_each_irq_desc_reverse(irq, desc)                          \
 	for (irq = nr_irqs - 1; irq >= 0; irq--)
-#else
+#else /* CONFIG_GENERIC_HARDIRQS */
 
 extern int nr_irqs;
+extern struct irq_desc *irq_to_desc(unsigned int irq);
 
-#ifndef CONFIG_SPARSE_IRQ
+# define for_each_irq_desc(irq, desc)					\
+	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
+	     irq++, desc = irq_to_desc(irq))
+# define for_each_irq_desc_reverse(irq, desc)				\
+	for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0;	\
+	     irq--, desc = irq_to_desc(irq))
 
-struct irq_desc;
-# define for_each_irq_desc(irq, desc)		\
-	for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-# define for_each_irq_desc_reverse(irq, desc)                          \
-	for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
-	    irq >= 0; irq--, desc--)
-#endif
-#endif
+#endif /* CONFIG_GENERIC_HARDIRQS */
 
 #define for_each_irq_nr(irq)                   \
        for (irq = 0; irq < nr_irqs; irq++)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6492400cb50..4db7d2df86b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -203,7 +203,7 @@ out_unlock:
 	return desc;
 }
 
-#else
+#else /* !CONFIG_SPARSE_IRQ */
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS-1] = {
@@ -218,7 +218,16 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
-#endif
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+	return irq_to_desc(irq);
+}
+#endif /* !CONFIG_SPARSE_IRQ */
 
 /*
  * What should we do if we get a hw irq event on an illegal vector?
-- 
cgit v1.2.3-70-g09d2


From 18eefedfe8ad33e8fc7614c13359e29a9fab4644 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 12:29:48 +0900
Subject: irq: simplify for_each_irq_desc() usage

Impact: cleanup

all for_each_irq_desc() usage point have !desc check.
then its check can move into for_each_irq_desc() macro.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic.c | 10 ----------
 drivers/xen/events.c      |  3 ---
 include/linux/irqnr.h     |  8 ++++++--
 kernel/irq/autoprobe.c    | 15 ---------------
 kernel/irq/handle.c       |  3 ---
 kernel/irq/spurious.c     |  5 -----
 6 files changed, 6 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index a74887b416c..2fe543f58ac 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -1345,8 +1345,6 @@ void __setup_vector_irq(int cpu)
 
 	/* Mark the inuse vectors */
 	for_each_irq_desc(irq, desc) {
-		if (!desc)
-			continue;
 		cfg = desc->chip_data;
 		if (!cpu_isset(cpu, cfg->domain))
 			continue;
@@ -1730,8 +1728,6 @@ __apicdebuginit(void) print_IO_APIC(void)
 	for_each_irq_desc(irq, desc) {
 		struct irq_pin_list *entry;
 
-		if (!desc)
-			continue;
 		cfg = desc->chip_data;
 		entry = cfg->irq_2_pin;
 		if (!entry)
@@ -2378,9 +2374,6 @@ static void ir_irq_migration(struct work_struct *work)
 	struct irq_desc *desc;
 
 	for_each_irq_desc(irq, desc) {
-		if (!desc)
-			continue;
-
 		if (desc->status & IRQ_MOVE_PENDING) {
 			unsigned long flags;
 
@@ -2671,9 +2664,6 @@ static inline void init_IO_APIC_traps(void)
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
 	for_each_irq_desc(irq, desc) {
-		if (!desc)
-			continue;
-
 		cfg = desc->chip_data;
 		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
 			/*
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 46625cd3874..e26733a9df2 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -142,9 +142,6 @@ static void init_evtchn_cpu_bindings(void)
 
 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		desc->affinity = cpumask_of_cpu(0);
 	}
 #endif
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index c4a59c7a478..5504a5c9783 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -22,10 +22,14 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
 
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
-	     irq++, desc = irq_to_desc(irq))
+	     irq++, desc = irq_to_desc(irq))				\
+		if (desc)
+
+
 # define for_each_irq_desc_reverse(irq, desc)				\
 	for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0;	\
-	     irq--, desc = irq_to_desc(irq))
+	     irq--, desc = irq_to_desc(irq))				\
+		if (desc)
 
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 650ce4102a6..cc0f7321b8c 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,9 +40,6 @@ unsigned long probe_irq_on(void)
 	 * flush such a longstanding irq before considering it as spurious.
 	 */
 	for_each_irq_desc_reverse(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			/*
@@ -71,9 +68,6 @@ unsigned long probe_irq_on(void)
 	 * happened in the previous stage, it may have masked itself)
 	 */
 	for_each_irq_desc_reverse(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
 			desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -92,9 +86,6 @@ unsigned long probe_irq_on(void)
 	 * Now filter out any obviously spurious interrupts
 	 */
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -133,9 +124,6 @@ unsigned int probe_irq_mask(unsigned long val)
 	int i;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
@@ -178,9 +166,6 @@ int probe_irq_off(unsigned long val)
 	unsigned int status;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		spin_lock_irq(&desc->lock);
 		status = desc->status;
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 03479dfdebb..7dbdfe52469 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -437,9 +437,6 @@ void early_init_irq_lock_class(void)
 	int i;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	}
 }
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3738107531f..dd364c11e56 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,9 +91,6 @@ static int misrouted_irq(int irq)
 	int i, ok = 0;
 
 	for_each_irq_desc(i, desc) {
-		if (!desc)
-			continue;
-
 		if (!i)
 			 continue;
 
@@ -115,8 +112,6 @@ static void poll_spurious_irqs(unsigned long dummy)
 	for_each_irq_desc(i, desc) {
 		unsigned int status;
 
-		if (!desc)
-			continue;
 		if (!i)
 			 continue;
 
-- 
cgit v1.2.3-70-g09d2


From 13a0c3c269b223f60abfac8a9811d77111a8b4ba Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 26 Dec 2008 02:05:47 -0800
Subject: sparseirq: work around compiler optimizing away __weak functions

Impact: fix panic on null pointer with sparseirq

Some GCC versions seem to inline the weak global function,
when that function is empty.

Work it around, by making the functions return a (dummy) integer.

Signed-off-by: Yinghai <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/io_apic.c | 8 ++++++--
 include/linux/irq.h       | 6 +++---
 init/main.c               | 7 ++++---
 kernel/irq/handle.c       | 7 ++++---
 4 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 2fe543f58ac..97603937784 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -170,7 +170,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
 	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
 };
 
-void __init arch_early_irq_init(void)
+int __init arch_early_irq_init(void)
 {
 	struct irq_cfg *cfg;
 	struct irq_desc *desc;
@@ -184,6 +184,8 @@ void __init arch_early_irq_init(void)
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
 	}
+
+	return 0;
 }
 
 #ifdef CONFIG_SPARSE_IRQ
@@ -212,7 +214,7 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 	return cfg;
 }
 
-void arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
 	struct irq_cfg *cfg;
 
@@ -224,6 +226,8 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu)
 			BUG_ON(1);
 		}
 	}
+
+	return 0;
 }
 
 #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 69da275c0eb..0e40af4bac4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -193,9 +193,9 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-extern void early_irq_init(void);
-extern void arch_early_irq_init(void);
-extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern int early_irq_init(void);
+extern int arch_early_irq_init(void);
+extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
 extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
 					struct irq_desc *desc, int cpu);
 extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
diff --git a/init/main.c b/init/main.c
index c1f999a3cf3..c314aa15370 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,13 +539,14 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
-void __init __weak arch_early_irq_init(void)
+int __init __weak arch_early_irq_init(void)
 {
+	return 0;
 }
 
-void __init __weak early_irq_init(void)
+int __init __weak early_irq_init(void)
 {
-	arch_early_irq_init();
+	return arch_early_irq_init();
 }
 
 asmlinkage void __init start_kernel(void)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 893da67b778..0bef3ecb7a0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -86,8 +86,9 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 		desc->kstat_irqs = (unsigned int *)ptr;
 }
 
-void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
+	return 0;
 }
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
@@ -132,7 +133,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 /* FIXME: use bootmem alloc ...*/
 static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
 
-void __init early_irq_init(void)
+int __init early_irq_init(void)
 {
 	struct irq_desc *desc;
 	int legacy_count;
@@ -151,7 +152,7 @@ void __init early_irq_init(void)
 	for (i = legacy_count; i < NR_IRQS; i++)
 		irq_desc_ptrs[i] = NULL;
 
-	arch_early_irq_init();
+	return arch_early_irq_init();
 }
 
 struct irq_desc *irq_to_desc(unsigned int irq)
-- 
cgit v1.2.3-70-g09d2


From 1eca4365be25c540650693e941bc06a66cf38f94 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Nov 2008 20:03:17 +0900
Subject: libata: beef up iterators

There currently are the following looping constructs.

* __ata_port_for_each_link() for all available links
* ata_port_for_each_link() for edge links
* ata_link_for_each_dev() for all devices
* ata_link_for_each_dev_reverse() for all devices in reverse order

Now there's a need for looping construct which is similar to
__ata_port_for_each_link() but iterates over PMP links before the host
link.  Instead of adding another one with long name, do the following
cleanup.

* Implement and export ata_link_next() and ata_dev_next() which take
  @mode parameter and can be used to build custom loop.
* Implement ata_for_each_link() and ata_for_each_dev() which take
  looping mode explicitly.

The following iteration modes are implemented.

* ATA_LITER_EDGE		: loop over edge links
* ATA_LITER_HOST_FIRST		: loop over all links, host link first
* ATA_LITER_PMP_FIRST		: loop over all links, PMP links first

* ATA_DITER_ENABLED		: loop over enabled devices
* ATA_DITER_ENABLED_REVERSE	: loop over enabled devices in reverse order
* ATA_DITER_ALL			: loop over all devices
* ATA_DITER_ALL_REVERSE		: loop over all devices in reverse order

This change removes exlicit device enabledness checks from many loops
and makes it clear which ones are iterated over in which direction.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ahci.c           |   8 +-
 drivers/ata/ata_generic.c    |   5 +-
 drivers/ata/libata-acpi.c    |  19 ++---
 drivers/ata/libata-core.c    | 183 ++++++++++++++++++++++++++++---------------
 drivers/ata/libata-eh.c      |  84 +++++++++-----------
 drivers/ata/libata-pmp.c     |  22 +++---
 drivers/ata/libata-scsi.c    |  22 +++---
 drivers/ata/pata_it821x.c    |  34 ++++----
 drivers/ata/pata_ixp4xx_cf.c |  14 ++--
 drivers/ata/pata_legacy.c    |  17 ++--
 drivers/ata/pata_pdc2027x.c  |  31 ++++----
 drivers/ata/pata_platform.c  |  14 ++--
 drivers/ata/pata_rz1000.c    |  16 ++--
 drivers/ata/sata_sil.c       |   2 +-
 include/linux/libata.h       |  76 +++++++++++++-----
 15 files changed, 307 insertions(+), 240 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index a67b8e7c712..656448c7fef 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1119,14 +1119,14 @@ static void ahci_start_port(struct ata_port *ap)
 
 	/* turn on LEDs */
 	if (ap->flags & ATA_FLAG_EM) {
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			emp = &pp->em_priv[link->pmp];
 			ahci_transmit_led_message(ap, emp->led_state, 4);
 		}
 	}
 
 	if (ap->flags & ATA_FLAG_SW_ACTIVITY)
-		ata_port_for_each_link(link, ap)
+		ata_for_each_link(link, ap, EDGE)
 			ahci_init_sw_activity(link);
 
 }
@@ -1361,7 +1361,7 @@ static ssize_t ahci_led_show(struct ata_port *ap, char *buf)
 	struct ahci_em_priv *emp;
 	int rc = 0;
 
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		emp = &pp->em_priv[link->pmp];
 		rc += sprintf(buf, "%lx\n", emp->led_state);
 	}
@@ -1941,7 +1941,7 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat)
 	u32 serror;
 
 	/* determine active link */
-	ata_port_for_each_link(link, ap)
+	ata_for_each_link(link, ap, EDGE)
 		if (ata_link_active(link))
 			break;
 	if (!link)
diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c
index 5c33767e66d..dc48a6398ab 100644
--- a/drivers/ata/ata_generic.c
+++ b/drivers/ata/ata_generic.c
@@ -57,10 +57,7 @@ static int generic_set_mode(struct ata_link *link, struct ata_device **unused)
 	if (pdev->vendor == PCI_VENDOR_ID_CENATEK)
 		dma_enabled = 0xFF;
 
-	ata_link_for_each_dev(dev, link) {
-		if (!ata_dev_enabled(dev))
-			continue;
-
+	ata_for_each_dev(dev, link, ENABLED) {
 		/* We don't really care */
 		dev->pio_mode = XFER_PIO_0;
 		dev->dma_mode = XFER_MW_DMA_0;
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index c012307d0ba..ef02e488d46 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -89,7 +89,7 @@ void ata_acpi_associate_sata_port(struct ata_port *ap)
 
 		ap->link.device->acpi_handle = NULL;
 
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			acpi_integer adr = SATA_ADR(ap->port_no, link->pmp);
 
 			link->device->acpi_handle =
@@ -129,8 +129,8 @@ static void ata_acpi_detach_device(struct ata_port *ap, struct ata_device *dev)
 		struct ata_link *tlink;
 		struct ata_device *tdev;
 
-		ata_port_for_each_link(tlink, ap)
-			ata_link_for_each_dev(tdev, tlink)
+		ata_for_each_link(tlink, ap, EDGE)
+			ata_for_each_dev(tdev, tlink, ALL)
 				tdev->flags |= ATA_DFLAG_DETACH;
 	}
 
@@ -588,12 +588,9 @@ int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm)
 {
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, &ap->link) {
+	ata_for_each_dev(dev, &ap->link, ENABLED) {
 		unsigned long xfer_mask, udma_mask;
 
-		if (!ata_dev_enabled(dev))
-			continue;
-
 		xfer_mask = ata_acpi_gtm_xfermask(dev, gtm);
 		ata_unpack_xfermask(xfer_mask, NULL, NULL, &udma_mask);
 
@@ -893,7 +890,7 @@ void ata_acpi_on_resume(struct ata_port *ap)
 		 * use values set by _STM.  Cache _GTF result and
 		 * schedule _GTF.
 		 */
-		ata_link_for_each_dev(dev, &ap->link) {
+		ata_for_each_dev(dev, &ap->link, ALL) {
 			ata_acpi_clear_gtf(dev);
 			if (ata_dev_enabled(dev) &&
 			    ata_dev_get_GTF(dev, NULL) >= 0)
@@ -904,7 +901,7 @@ void ata_acpi_on_resume(struct ata_port *ap)
 		 * there's no reason to evaluate IDE _GTF early
 		 * without _STM.  Clear cache and schedule _GTF.
 		 */
-		ata_link_for_each_dev(dev, &ap->link) {
+		ata_for_each_dev(dev, &ap->link, ALL) {
 			ata_acpi_clear_gtf(dev);
 			if (ata_dev_enabled(dev))
 				dev->flags |= ATA_DFLAG_ACPI_PENDING;
@@ -932,8 +929,8 @@ void ata_acpi_set_state(struct ata_port *ap, pm_message_t state)
 	if (state.event == PM_EVENT_ON)
 		acpi_bus_set_power(ap->acpi_handle, ACPI_STATE_D0);
 
-	ata_link_for_each_dev(dev, &ap->link) {
-		if (dev->acpi_handle && ata_dev_enabled(dev))
+	ata_for_each_dev(dev, &ap->link, ENABLED) {
+		if (dev->acpi_handle)
 			acpi_bus_set_power(dev->acpi_handle,
 				state.event == PM_EVENT_ON ?
 					ACPI_STATE_D0 : ACPI_STATE_D3);
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index bc6695e3c84..ffd98e4e65b 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -163,42 +163,118 @@ MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_VERSION);
 
 
-/*
- * Iterator helpers.  Don't use directly.
+/**
+ *	ata_link_next - link iteration helper
+ *	@link: the previous link, NULL to start
+ *	@ap: ATA port containing links to iterate
+ *	@mode: iteration mode, one of ATA_LITER_*
+ *
+ *	LOCKING:
+ *	Host lock or EH context.
  *
- * LOCKING:
- * Host lock or EH context.
+ *	RETURNS:
+ *	Pointer to the next link.
  */
-struct ata_link *__ata_port_next_link(struct ata_port *ap,
-				      struct ata_link *link, bool dev_only)
+struct ata_link *ata_link_next(struct ata_link *link, struct ata_port *ap,
+			       enum ata_link_iter_mode mode)
 {
+	BUG_ON(mode != ATA_LITER_EDGE &&
+	       mode != ATA_LITER_PMP_FIRST && mode != ATA_LITER_HOST_FIRST);
+
 	/* NULL link indicates start of iteration */
-	if (!link) {
-		if (dev_only && sata_pmp_attached(ap))
-			return ap->pmp_link;
-		return &ap->link;
-	}
+	if (!link)
+		switch (mode) {
+		case ATA_LITER_EDGE:
+		case ATA_LITER_PMP_FIRST:
+			if (sata_pmp_attached(ap))
+				return ap->pmp_link;
+			/* fall through */
+		case ATA_LITER_HOST_FIRST:
+			return &ap->link;
+		}
 
-	/* we just iterated over the host master link, what's next? */
-	if (link == &ap->link) {
-		if (!sata_pmp_attached(ap)) {
-			if (unlikely(ap->slave_link) && !dev_only)
+	/* we just iterated over the host link, what's next? */
+	if (link == &ap->link)
+		switch (mode) {
+		case ATA_LITER_HOST_FIRST:
+			if (sata_pmp_attached(ap))
+				return ap->pmp_link;
+			/* fall through */
+		case ATA_LITER_PMP_FIRST:
+			if (unlikely(ap->slave_link))
 				return ap->slave_link;
+			/* fall through */
+		case ATA_LITER_EDGE:
 			return NULL;
 		}
-		return ap->pmp_link;
-	}
 
 	/* slave_link excludes PMP */
 	if (unlikely(link == ap->slave_link))
 		return NULL;
 
-	/* iterate to the next PMP link */
+	/* we were over a PMP link */
 	if (++link < ap->pmp_link + ap->nr_pmp_links)
 		return link;
+
+	if (mode == ATA_LITER_PMP_FIRST)
+		return &ap->link;
+
 	return NULL;
 }
 
+/**
+ *	ata_dev_next - device iteration helper
+ *	@dev: the previous device, NULL to start
+ *	@link: ATA link containing devices to iterate
+ *	@mode: iteration mode, one of ATA_DITER_*
+ *
+ *	LOCKING:
+ *	Host lock or EH context.
+ *
+ *	RETURNS:
+ *	Pointer to the next device.
+ */
+struct ata_device *ata_dev_next(struct ata_device *dev, struct ata_link *link,
+				enum ata_dev_iter_mode mode)
+{
+	BUG_ON(mode != ATA_DITER_ENABLED && mode != ATA_DITER_ENABLED_REVERSE &&
+	       mode != ATA_DITER_ALL && mode != ATA_DITER_ALL_REVERSE);
+
+	/* NULL dev indicates start of iteration */
+	if (!dev)
+		switch (mode) {
+		case ATA_DITER_ENABLED:
+		case ATA_DITER_ALL:
+			dev = link->device;
+			goto check;
+		case ATA_DITER_ENABLED_REVERSE:
+		case ATA_DITER_ALL_REVERSE:
+			dev = link->device + ata_link_max_devices(link) - 1;
+			goto check;
+		}
+
+ next:
+	/* move to the next one */
+	switch (mode) {
+	case ATA_DITER_ENABLED:
+	case ATA_DITER_ALL:
+		if (++dev < link->device + ata_link_max_devices(link))
+			goto check;
+		return NULL;
+	case ATA_DITER_ENABLED_REVERSE:
+	case ATA_DITER_ALL_REVERSE:
+		if (--dev >= link->device)
+			goto check;
+		return NULL;
+	}
+
+ check:
+	if ((mode == ATA_DITER_ENABLED || mode == ATA_DITER_ENABLED_REVERSE) &&
+	    !ata_dev_enabled(dev))
+		goto next;
+	return dev;
+}
+
 /**
  *	ata_dev_phys_link - find physical link for a device
  *	@dev: ATA device to look up physical link for
@@ -1107,8 +1183,8 @@ static void ata_lpm_enable(struct ata_host *host)
 
 	for (i = 0; i < host->n_ports; i++) {
 		ap = host->ports[i];
-		ata_port_for_each_link(link, ap) {
-			ata_link_for_each_dev(dev, link)
+		ata_for_each_link(link, ap, EDGE) {
+			ata_for_each_dev(dev, link, ALL)
 				ata_dev_disable_pm(dev);
 		}
 	}
@@ -2594,11 +2670,11 @@ int ata_bus_probe(struct ata_port *ap)
 
 	ata_port_probe(ap);
 
-	ata_link_for_each_dev(dev, &ap->link)
+	ata_for_each_dev(dev, &ap->link, ALL)
 		tries[dev->devno] = ATA_PROBE_MAX_TRIES;
 
  retry:
-	ata_link_for_each_dev(dev, &ap->link) {
+	ata_for_each_dev(dev, &ap->link, ALL) {
 		/* If we issue an SRST then an ATA drive (not ATAPI)
 		 * may change configuration and be in PIO0 timing. If
 		 * we do a hard reset (or are coming from power on)
@@ -2620,7 +2696,7 @@ int ata_bus_probe(struct ata_port *ap)
 	/* reset and determine device classes */
 	ap->ops->phy_reset(ap);
 
-	ata_link_for_each_dev(dev, &ap->link) {
+	ata_for_each_dev(dev, &ap->link, ALL) {
 		if (!(ap->flags & ATA_FLAG_DISABLED) &&
 		    dev->class != ATA_DEV_UNKNOWN)
 			classes[dev->devno] = dev->class;
@@ -2636,7 +2712,7 @@ int ata_bus_probe(struct ata_port *ap)
 	   specific sequence bass-ackwards so that PDIAG- is released by
 	   the slave device */
 
-	ata_link_for_each_dev_reverse(dev, &ap->link) {
+	ata_for_each_dev(dev, &ap->link, ALL_REVERSE) {
 		if (tries[dev->devno])
 			dev->class = classes[dev->devno];
 
@@ -2653,24 +2729,19 @@ int ata_bus_probe(struct ata_port *ap)
 	if (ap->ops->cable_detect)
 		ap->cbl = ap->ops->cable_detect(ap);
 
-	/* We may have SATA bridge glue hiding here irrespective of the
-	   reported cable types and sensed types */
-	ata_link_for_each_dev(dev, &ap->link) {
-		if (!ata_dev_enabled(dev))
-			continue;
-		/* SATA drives indicate we have a bridge. We don't know which
-		   end of the link the bridge is which is a problem */
+	/* We may have SATA bridge glue hiding here irrespective of
+	 * the reported cable types and sensed types.  When SATA
+	 * drives indicate we have a bridge, we don't know which end
+	 * of the link the bridge is which is a problem.
+	 */
+	ata_for_each_dev(dev, &ap->link, ENABLED)
 		if (ata_id_is_sata(dev->id))
 			ap->cbl = ATA_CBL_SATA;
-	}
 
 	/* After the identify sequence we can now set up the devices. We do
 	   this in the normal order so that the user doesn't get confused */
 
-	ata_link_for_each_dev(dev, &ap->link) {
-		if (!ata_dev_enabled(dev))
-			continue;
-
+	ata_for_each_dev(dev, &ap->link, ENABLED) {
 		ap->link.eh_context.i.flags |= ATA_EHI_PRINTINFO;
 		rc = ata_dev_configure(dev);
 		ap->link.eh_context.i.flags &= ~ATA_EHI_PRINTINFO;
@@ -2683,9 +2754,8 @@ int ata_bus_probe(struct ata_port *ap)
 	if (rc)
 		goto fail;
 
-	ata_link_for_each_dev(dev, &ap->link)
-		if (ata_dev_enabled(dev))
-			return 0;
+	ata_for_each_dev(dev, &ap->link, ENABLED)
+		return 0;
 
 	/* no device present, disable port */
 	ata_port_disable(ap);
@@ -3331,13 +3401,10 @@ int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 	int rc = 0, used_dma = 0, found = 0;
 
 	/* step 1: calculate xfer_mask */
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ENABLED) {
 		unsigned long pio_mask, dma_mask;
 		unsigned int mode_mask;
 
-		if (!ata_dev_enabled(dev))
-			continue;
-
 		mode_mask = ATA_DMA_MASK_ATA;
 		if (dev->class == ATA_DEV_ATAPI)
 			mode_mask = ATA_DMA_MASK_ATAPI;
@@ -3366,10 +3433,7 @@ int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 		goto out;
 
 	/* step 2: always set host PIO timings */
-	ata_link_for_each_dev(dev, link) {
-		if (!ata_dev_enabled(dev))
-			continue;
-
+	ata_for_each_dev(dev, link, ENABLED) {
 		if (dev->pio_mode == 0xff) {
 			ata_dev_printk(dev, KERN_WARNING, "no PIO support\n");
 			rc = -EINVAL;
@@ -3383,8 +3447,8 @@ int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 	}
 
 	/* step 3: set host DMA timings */
-	ata_link_for_each_dev(dev, link) {
-		if (!ata_dev_enabled(dev) || !ata_dma_enabled(dev))
+	ata_for_each_dev(dev, link, ENABLED) {
+		if (!ata_dma_enabled(dev))
 			continue;
 
 		dev->xfer_mode = dev->dma_mode;
@@ -3394,11 +3458,7 @@ int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 	}
 
 	/* step 4: update devices' xfer mode */
-	ata_link_for_each_dev(dev, link) {
-		/* don't update suspended devices' xfer mode */
-		if (!ata_dev_enabled(dev))
-			continue;
-
+	ata_for_each_dev(dev, link, ENABLED) {
 		rc = ata_dev_set_mode(dev);
 		if (rc)
 			goto out;
@@ -4263,9 +4323,9 @@ static int cable_is_40wire(struct ata_port *ap)
 	 * - if you have a non detect capable drive you don't want it
 	 *   to colour the choice
 	 */
-	ata_port_for_each_link(link, ap) {
-		ata_link_for_each_dev(dev, link) {
-			if (ata_dev_enabled(dev) && !ata_is_40wire(dev))
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, link, ENABLED) {
+			if (!ata_is_40wire(dev))
 				return 0;
 		}
 	}
@@ -5218,7 +5278,7 @@ static int ata_host_request_pm(struct ata_host *host, pm_message_t mesg,
 		}
 
 		ap->pflags |= ATA_PFLAG_PM_PENDING;
-		__ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, HOST_FIRST) {
 			link->eh_info.action |= action;
 			link->eh_info.flags |= ehi_flags;
 		}
@@ -6063,9 +6123,9 @@ static void ata_port_detach(struct ata_port *ap)
 	/* EH is now guaranteed to see UNLOADING - EH context belongs
 	 * to us.  Restore SControl and disable all existing devices.
 	 */
-	__ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, HOST_FIRST) {
 		sata_scr_write(link, SCR_CONTROL, link->saved_scontrol & 0xff0);
-		ata_link_for_each_dev(dev, link)
+		ata_for_each_dev(dev, link, ALL)
 			ata_dev_disable(dev);
 	}
 
@@ -6528,7 +6588,8 @@ EXPORT_SYMBOL_GPL(ata_base_port_ops);
 EXPORT_SYMBOL_GPL(sata_port_ops);
 EXPORT_SYMBOL_GPL(ata_dummy_port_ops);
 EXPORT_SYMBOL_GPL(ata_dummy_port_info);
-EXPORT_SYMBOL_GPL(__ata_port_next_link);
+EXPORT_SYMBOL_GPL(ata_link_next);
+EXPORT_SYMBOL_GPL(ata_dev_next);
 EXPORT_SYMBOL_GPL(ata_std_bios_param);
 EXPORT_SYMBOL_GPL(ata_host_init);
 EXPORT_SYMBOL_GPL(ata_host_alloc);
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 32da9a93ce4..d673f3712bd 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -422,7 +422,7 @@ static void ata_eh_clear_action(struct ata_link *link, struct ata_device *dev,
 
 	if (!dev) {
 		ehi->action &= ~action;
-		ata_link_for_each_dev(tdev, link)
+		ata_for_each_dev(tdev, link, ALL)
 			ehi->dev_action[tdev->devno] &= ~action;
 	} else {
 		/* doesn't make sense for port-wide EH actions */
@@ -430,7 +430,7 @@ static void ata_eh_clear_action(struct ata_link *link, struct ata_device *dev,
 
 		/* break ehi->action into ehi->dev_action */
 		if (ehi->action & action) {
-			ata_link_for_each_dev(tdev, link)
+			ata_for_each_dev(tdev, link, ALL)
 				ehi->dev_action[tdev->devno] |=
 					ehi->action & action;
 			ehi->action &= ~action;
@@ -592,7 +592,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 		/* fetch & clear EH info */
 		spin_lock_irqsave(ap->lock, flags);
 
-		__ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, HOST_FIRST) {
 			struct ata_eh_context *ehc = &link->eh_context;
 			struct ata_device *dev;
 
@@ -600,12 +600,9 @@ void ata_scsi_error(struct Scsi_Host *host)
 			link->eh_context.i = link->eh_info;
 			memset(&link->eh_info, 0, sizeof(link->eh_info));
 
-			ata_link_for_each_dev(dev, link) {
+			ata_for_each_dev(dev, link, ENABLED) {
 				int devno = dev->devno;
 
-				if (!ata_dev_enabled(dev))
-					continue;
-
 				ehc->saved_xfer_mode[devno] = dev->xfer_mode;
 				if (ata_ncq_enabled(dev))
 					ehc->saved_ncq_enabled |= 1 << devno;
@@ -644,7 +641,7 @@ void ata_scsi_error(struct Scsi_Host *host)
 		}
 
 		/* this run is complete, make sure EH info is clear */
-		__ata_port_for_each_link(link, ap)
+		ata_for_each_link(link, ap, HOST_FIRST)
 			memset(&link->eh_info, 0, sizeof(link->eh_info));
 
 		/* Clear host_eh_scheduled while holding ap->lock such
@@ -1025,7 +1022,7 @@ int sata_async_notification(struct ata_port *ap)
 		struct ata_link *link;
 
 		/* check and notify ATAPI AN */
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			if (!(sntf & (1 << link->pmp)))
 				continue;
 
@@ -2005,7 +2002,7 @@ void ata_eh_autopsy(struct ata_port *ap)
 {
 	struct ata_link *link;
 
-	ata_port_for_each_link(link, ap)
+	ata_for_each_link(link, ap, EDGE)
 		ata_eh_link_autopsy(link);
 
 	/* Handle the frigging slave link.  Autopsy is done similarly
@@ -2219,7 +2216,7 @@ void ata_eh_report(struct ata_port *ap)
 {
 	struct ata_link *link;
 
-	__ata_port_for_each_link(link, ap)
+	ata_for_each_link(link, ap, HOST_FIRST)
 		ata_eh_link_report(link);
 }
 
@@ -2230,7 +2227,7 @@ static int ata_do_reset(struct ata_link *link, ata_reset_fn_t reset,
 	struct ata_device *dev;
 
 	if (clear_classes)
-		ata_link_for_each_dev(dev, link)
+		ata_for_each_dev(dev, link, ALL)
 			classes[dev->devno] = ATA_DEV_UNKNOWN;
 
 	return reset(link, classes, deadline);
@@ -2294,7 +2291,7 @@ int ata_eh_reset(struct ata_link *link, int classify,
 
 	ata_eh_about_to_do(link, NULL, ATA_EH_RESET);
 
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		/* If we issue an SRST then an ATA drive (not ATAPI)
 		 * may change configuration and be in PIO0 timing. If
 		 * we do a hard reset (or are coming from power on)
@@ -2355,7 +2352,7 @@ int ata_eh_reset(struct ata_link *link, int classify,
 						"port disabled. ignoring.\n");
 				ehc->i.action &= ~ATA_EH_RESET;
 
-				ata_link_for_each_dev(dev, link)
+				ata_for_each_dev(dev, link, ALL)
 					classes[dev->devno] = ATA_DEV_NONE;
 
 				rc = 0;
@@ -2369,7 +2366,7 @@ int ata_eh_reset(struct ata_link *link, int classify,
 		 * bang classes and return.
 		 */
 		if (reset && !(ehc->i.action & ATA_EH_RESET)) {
-			ata_link_for_each_dev(dev, link)
+			ata_for_each_dev(dev, link, ALL)
 				classes[dev->devno] = ATA_DEV_NONE;
 			rc = 0;
 			goto out;
@@ -2454,7 +2451,7 @@ int ata_eh_reset(struct ata_link *link, int classify,
 	/*
 	 * Post-reset processing
 	 */
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		/* After the reset, the device state is PIO 0 and the
 		 * controller state is undefined.  Reset also wakes up
 		 * drives from sleeping mode.
@@ -2510,7 +2507,7 @@ int ata_eh_reset(struct ata_link *link, int classify,
 	 * can be reliably detected and retried.
 	 */
 	nr_unknown = 0;
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		/* convert all ATA_DEV_UNKNOWN to ATA_DEV_NONE */
 		if (classes[dev->devno] == ATA_DEV_UNKNOWN) {
 			classes[dev->devno] = ATA_DEV_NONE;
@@ -2619,8 +2616,8 @@ static inline void ata_eh_pull_park_action(struct ata_port *ap)
 
 	spin_lock_irqsave(ap->lock, flags);
 	INIT_COMPLETION(ap->park_req_pending);
-	ata_port_for_each_link(link, ap) {
-		ata_link_for_each_dev(dev, link) {
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, link, ALL) {
 			struct ata_eh_info *ehi = &link->eh_info;
 
 			link->eh_context.i.dev_action[dev->devno] |=
@@ -2675,7 +2672,7 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link,
 	 * be done backwards such that PDIAG- is released by the slave
 	 * device before the master device is identified.
 	 */
-	ata_link_for_each_dev_reverse(dev, link) {
+	ata_for_each_dev(dev, link, ALL_REVERSE) {
 		unsigned int action = ata_eh_dev_action(dev);
 		unsigned int readid_flags = 0;
 
@@ -2744,7 +2741,7 @@ static int ata_eh_revalidate_and_attach(struct ata_link *link,
 	/* Configure new devices forward such that user doesn't see
 	 * device detection messages backwards.
 	 */
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		if (!(new_mask & (1 << dev->devno)) ||
 		    dev->class == ATA_DEV_PMP)
 			continue;
@@ -2793,10 +2790,7 @@ int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 	int rc;
 
 	/* if data transfer is verified, clear DUBIOUS_XFER on ering top */
-	ata_link_for_each_dev(dev, link) {
-		if (!ata_dev_enabled(dev))
-			continue;
-
+	ata_for_each_dev(dev, link, ENABLED) {
 		if (!(dev->flags & ATA_DFLAG_DUBIOUS_XFER)) {
 			struct ata_ering_entry *ent;
 
@@ -2813,14 +2807,11 @@ int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 		rc = ata_do_set_mode(link, r_failed_dev);
 
 	/* if transfer mode has changed, set DUBIOUS_XFER on device */
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ENABLED) {
 		struct ata_eh_context *ehc = &link->eh_context;
 		u8 saved_xfer_mode = ehc->saved_xfer_mode[dev->devno];
 		u8 saved_ncq = !!(ehc->saved_ncq_enabled & (1 << dev->devno));
 
-		if (!ata_dev_enabled(dev))
-			continue;
-
 		if (dev->xfer_mode != saved_xfer_mode ||
 		    ata_ncq_enabled(dev) != saved_ncq)
 			dev->flags |= ATA_DFLAG_DUBIOUS_XFER;
@@ -2881,9 +2872,8 @@ static int ata_link_nr_enabled(struct ata_link *link)
 	struct ata_device *dev;
 	int cnt = 0;
 
-	ata_link_for_each_dev(dev, link)
-		if (ata_dev_enabled(dev))
-			cnt++;
+	ata_for_each_dev(dev, link, ENABLED)
+		cnt++;
 	return cnt;
 }
 
@@ -2892,7 +2882,7 @@ static int ata_link_nr_vacant(struct ata_link *link)
 	struct ata_device *dev;
 	int cnt = 0;
 
-	ata_link_for_each_dev(dev, link)
+	ata_for_each_dev(dev, link, ALL)
 		if (dev->class == ATA_DEV_UNKNOWN)
 			cnt++;
 	return cnt;
@@ -2918,7 +2908,7 @@ static int ata_eh_skip_recovery(struct ata_link *link)
 		return 0;
 
 	/* skip if class codes for all vacant slots are ATA_DEV_NONE */
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		if (dev->class == ATA_DEV_UNKNOWN &&
 		    ehc->classes[dev->devno] != ATA_DEV_NONE)
 			return 0;
@@ -3026,7 +3016,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 	DPRINTK("ENTER\n");
 
 	/* prep for recovery */
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		struct ata_eh_context *ehc = &link->eh_context;
 
 		/* re-enable link? */
@@ -3038,7 +3028,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 			ata_eh_done(link, NULL, ATA_EH_ENABLE_LINK);
 		}
 
-		ata_link_for_each_dev(dev, link) {
+		ata_for_each_dev(dev, link, ALL) {
 			if (link->flags & ATA_LFLAG_NO_RETRY)
 				ehc->tries[dev->devno] = 1;
 			else
@@ -3068,19 +3058,19 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		goto out;
 
 	/* prep for EH */
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		struct ata_eh_context *ehc = &link->eh_context;
 
 		/* skip EH if possible. */
 		if (ata_eh_skip_recovery(link))
 			ehc->i.action = 0;
 
-		ata_link_for_each_dev(dev, link)
+		ata_for_each_dev(dev, link, ALL)
 			ehc->classes[dev->devno] = ATA_DEV_UNKNOWN;
 	}
 
 	/* reset */
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		struct ata_eh_context *ehc = &link->eh_context;
 
 		if (!(ehc->i.action & ATA_EH_RESET))
@@ -3105,8 +3095,8 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		ata_eh_pull_park_action(ap);
 
 		deadline = jiffies;
-		ata_port_for_each_link(link, ap) {
-			ata_link_for_each_dev(dev, link) {
+		ata_for_each_link(link, ap, EDGE) {
+			ata_for_each_dev(dev, link, ALL) {
 				struct ata_eh_context *ehc = &link->eh_context;
 				unsigned long tmp;
 
@@ -3134,8 +3124,8 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		deadline = wait_for_completion_timeout(&ap->park_req_pending,
 						       deadline - now);
 	} while (deadline);
-	ata_port_for_each_link(link, ap) {
-		ata_link_for_each_dev(dev, link) {
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, link, ALL) {
 			if (!(link->eh_context.unloaded_mask &
 			      (1 << dev->devno)))
 				continue;
@@ -3146,7 +3136,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 	}
 
 	/* the rest */
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		struct ata_eh_context *ehc = &link->eh_context;
 
 		/* revalidate existing devices and attach new ones */
@@ -3172,7 +3162,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		 * disrupting the current users of the device.
 		 */
 		if (ehc->i.flags & ATA_EHI_DID_RESET) {
-			ata_link_for_each_dev(dev, link) {
+			ata_for_each_dev(dev, link, ALL) {
 				if (dev->class != ATA_DEV_ATAPI)
 					continue;
 				rc = atapi_eh_clear_ua(dev);
@@ -3183,7 +3173,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 
 		/* configure link power saving */
 		if (ehc->i.action & ATA_EH_LPM)
-			ata_link_for_each_dev(dev, link)
+			ata_for_each_dev(dev, link, ALL)
 				ata_dev_enable_pm(dev, ap->pm_policy);
 
 		/* this link is okay now */
@@ -3288,7 +3278,7 @@ void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
 	rc = ata_eh_recover(ap, prereset, softreset, hardreset, postreset,
 			    NULL);
 	if (rc) {
-		ata_link_for_each_dev(dev, &ap->link)
+		ata_for_each_dev(dev, &ap->link, ALL)
 			ata_dev_disable(dev);
 	}
 
diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c
index b65db309c18..98ca07a2db8 100644
--- a/drivers/ata/libata-pmp.c
+++ b/drivers/ata/libata-pmp.c
@@ -321,7 +321,7 @@ static void sata_pmp_quirks(struct ata_port *ap)
 
 	if (vendor == 0x1095 && devid == 0x3726) {
 		/* sil3726 quirks */
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			/* Class code report is unreliable and SRST
 			 * times out under certain configurations.
 			 */
@@ -336,7 +336,7 @@ static void sata_pmp_quirks(struct ata_port *ap)
 		}
 	} else if (vendor == 0x1095 && devid == 0x4723) {
 		/* sil4723 quirks */
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			/* class code report is unreliable */
 			if (link->pmp < 2)
 				link->flags |= ATA_LFLAG_ASSUME_ATA;
@@ -348,7 +348,7 @@ static void sata_pmp_quirks(struct ata_port *ap)
 		}
 	} else if (vendor == 0x1095 && devid == 0x4726) {
 		/* sil4726 quirks */
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			/* Class code report is unreliable and SRST
 			 * times out under certain configurations.
 			 * Config device can be at port 0 or 5 and
@@ -450,7 +450,7 @@ int sata_pmp_attach(struct ata_device *dev)
 	if (ap->ops->pmp_attach)
 		ap->ops->pmp_attach(ap);
 
-	ata_port_for_each_link(tlink, ap)
+	ata_for_each_link(tlink, ap, EDGE)
 		sata_link_init_spd(tlink);
 
 	ata_acpi_associate_sata_port(ap);
@@ -487,7 +487,7 @@ static void sata_pmp_detach(struct ata_device *dev)
 	if (ap->ops->pmp_detach)
 		ap->ops->pmp_detach(ap);
 
-	ata_port_for_each_link(tlink, ap)
+	ata_for_each_link(tlink, ap, EDGE)
 		ata_eh_detach_dev(tlink->device);
 
 	spin_lock_irqsave(ap->lock, flags);
@@ -700,7 +700,7 @@ static int sata_pmp_eh_recover_pmp(struct ata_port *ap,
 		}
 
 		/* PMP is reset, SErrors cannot be trusted, scan all */
-		ata_port_for_each_link(tlink, ap) {
+		ata_for_each_link(tlink, ap, EDGE) {
 			struct ata_eh_context *ehc = &tlink->eh_context;
 
 			ehc->i.probe_mask |= ATA_ALL_DEVICES;
@@ -768,7 +768,7 @@ static int sata_pmp_eh_handle_disabled_links(struct ata_port *ap)
 
 	spin_lock_irqsave(ap->lock, flags);
 
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		if (!(link->flags & ATA_LFLAG_DISABLED))
 			continue;
 
@@ -852,7 +852,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
 	int cnt, rc;
 
 	pmp_tries = ATA_EH_PMP_TRIES;
-	ata_port_for_each_link(link, ap)
+	ata_for_each_link(link, ap, EDGE)
 		link_tries[link->pmp] = ATA_EH_PMP_LINK_TRIES;
 
  retry:
@@ -861,7 +861,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
 		rc = ata_eh_recover(ap, ops->prereset, ops->softreset,
 				    ops->hardreset, ops->postreset, NULL);
 		if (rc) {
-			ata_link_for_each_dev(dev, &ap->link)
+			ata_for_each_dev(dev, &ap->link, ALL)
 				ata_dev_disable(dev);
 			return rc;
 		}
@@ -870,7 +870,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
 			return 0;
 
 		/* new PMP online */
-		ata_port_for_each_link(link, ap)
+		ata_for_each_link(link, ap, EDGE)
 			link_tries[link->pmp] = ATA_EH_PMP_LINK_TRIES;
 
 		/* fall through */
@@ -942,7 +942,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
 	}
 
 	cnt = 0;
-	ata_port_for_each_link(link, ap) {
+	ata_for_each_link(link, ap, EDGE) {
 		if (!(gscr_error & (1 << link->pmp)))
 			continue;
 
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 47c7afcb36f..0b2e14f6765 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3229,12 +3229,12 @@ void ata_scsi_scan_host(struct ata_port *ap, int sync)
 		return;
 
  repeat:
-	ata_port_for_each_link(link, ap) {
-		ata_link_for_each_dev(dev, link) {
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, link, ENABLED) {
 			struct scsi_device *sdev;
 			int channel = 0, id = 0;
 
-			if (!ata_dev_enabled(dev) || dev->sdev)
+			if (dev->sdev)
 				continue;
 
 			if (ata_is_host_link(link))
@@ -3255,9 +3255,9 @@ void ata_scsi_scan_host(struct ata_port *ap, int sync)
 	 * failure occurred, scan would have failed silently.  Check
 	 * whether all devices are attached.
 	 */
-	ata_port_for_each_link(link, ap) {
-		ata_link_for_each_dev(dev, link) {
-			if (ata_dev_enabled(dev) && !dev->sdev)
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, link, ENABLED) {
+			if (!dev->sdev)
 				goto exit_loop;
 		}
 	}
@@ -3381,7 +3381,7 @@ static void ata_scsi_handle_link_detach(struct ata_link *link)
 	struct ata_port *ap = link->ap;
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		unsigned long flags;
 
 		if (!(dev->flags & ATA_DFLAG_DETACHED))
@@ -3496,7 +3496,7 @@ static int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 	if (devno == SCAN_WILD_CARD) {
 		struct ata_link *link;
 
-		ata_port_for_each_link(link, ap) {
+		ata_for_each_link(link, ap, EDGE) {
 			struct ata_eh_info *ehi = &link->eh_info;
 			ehi->probe_mask |= ATA_ALL_DEVICES;
 			ehi->action |= ATA_EH_RESET;
@@ -3544,11 +3544,11 @@ void ata_scsi_dev_rescan(struct work_struct *work)
 
 	spin_lock_irqsave(ap->lock, flags);
 
-	ata_port_for_each_link(link, ap) {
-		ata_link_for_each_dev(dev, link) {
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, link, ENABLED) {
 			struct scsi_device *sdev = dev->sdev;
 
-			if (!ata_dev_enabled(dev) || !sdev)
+			if (!sdev)
 				continue;
 			if (scsi_device_get(sdev))
 				continue;
diff --git a/drivers/ata/pata_it821x.c b/drivers/ata/pata_it821x.c
index 860ede52628..f828a29d775 100644
--- a/drivers/ata/pata_it821x.c
+++ b/drivers/ata/pata_it821x.c
@@ -465,24 +465,22 @@ static int it821x_smart_set_mode(struct ata_link *link, struct ata_device **unus
 {
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, link) {
-		if (ata_dev_enabled(dev)) {
-			/* We don't really care */
-			dev->pio_mode = XFER_PIO_0;
-			dev->dma_mode = XFER_MW_DMA_0;
-			/* We do need the right mode information for DMA or PIO
-			   and this comes from the current configuration flags */
-			if (ata_id_has_dma(dev->id)) {
-				ata_dev_printk(dev, KERN_INFO, "configured for DMA\n");
-				dev->xfer_mode = XFER_MW_DMA_0;
-				dev->xfer_shift = ATA_SHIFT_MWDMA;
-				dev->flags &= ~ATA_DFLAG_PIO;
-			} else {
-				ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
-				dev->xfer_mode = XFER_PIO_0;
-				dev->xfer_shift = ATA_SHIFT_PIO;
-				dev->flags |= ATA_DFLAG_PIO;
-			}
+	ata_for_each_dev(dev, link, ENABLED) {
+		/* We don't really care */
+		dev->pio_mode = XFER_PIO_0;
+		dev->dma_mode = XFER_MW_DMA_0;
+		/* We do need the right mode information for DMA or PIO
+		   and this comes from the current configuration flags */
+		if (ata_id_has_dma(dev->id)) {
+			ata_dev_printk(dev, KERN_INFO, "configured for DMA\n");
+			dev->xfer_mode = XFER_MW_DMA_0;
+			dev->xfer_shift = ATA_SHIFT_MWDMA;
+			dev->flags &= ~ATA_DFLAG_PIO;
+		} else {
+			ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
+			dev->xfer_mode = XFER_PIO_0;
+			dev->xfer_shift = ATA_SHIFT_PIO;
+			dev->flags |= ATA_DFLAG_PIO;
 		}
 	}
 	return 0;
diff --git a/drivers/ata/pata_ixp4xx_cf.c b/drivers/ata/pata_ixp4xx_cf.c
index 2014253f6c8..b173c157ab0 100644
--- a/drivers/ata/pata_ixp4xx_cf.c
+++ b/drivers/ata/pata_ixp4xx_cf.c
@@ -30,14 +30,12 @@ static int ixp4xx_set_mode(struct ata_link *link, struct ata_device **error)
 {
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, link) {
-		if (ata_dev_enabled(dev)) {
-			ata_dev_printk(dev, KERN_INFO, "configured for PIO0\n");
-			dev->pio_mode = XFER_PIO_0;
-			dev->xfer_mode = XFER_PIO_0;
-			dev->xfer_shift = ATA_SHIFT_PIO;
-			dev->flags |= ATA_DFLAG_PIO;
-		}
+	ata_for_each_dev(dev, link, ENABLED) {
+		ata_dev_printk(dev, KERN_INFO, "configured for PIO0\n");
+		dev->pio_mode = XFER_PIO_0;
+		dev->xfer_mode = XFER_PIO_0;
+		dev->xfer_shift = ATA_SHIFT_PIO;
+		dev->flags |= ATA_DFLAG_PIO;
 	}
 	return 0;
 }
diff --git a/drivers/ata/pata_legacy.c b/drivers/ata/pata_legacy.c
index 930c2208640..cbd98c16eea 100644
--- a/drivers/ata/pata_legacy.c
+++ b/drivers/ata/pata_legacy.c
@@ -194,15 +194,12 @@ static int legacy_set_mode(struct ata_link *link, struct ata_device **unused)
 {
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, link) {
-		if (ata_dev_enabled(dev)) {
-			ata_dev_printk(dev, KERN_INFO,
-						"configured for PIO\n");
-			dev->pio_mode = XFER_PIO_0;
-			dev->xfer_mode = XFER_PIO_0;
-			dev->xfer_shift = ATA_SHIFT_PIO;
-			dev->flags |= ATA_DFLAG_PIO;
-		}
+	ata_for_each_dev(dev, link, ENABLED) {
+		ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
+		dev->pio_mode = XFER_PIO_0;
+		dev->xfer_mode = XFER_PIO_0;
+		dev->xfer_shift = ATA_SHIFT_PIO;
+		dev->flags |= ATA_DFLAG_PIO;
 	}
 	return 0;
 }
@@ -1028,7 +1025,7 @@ static __init int legacy_init_one(struct legacy_probe *probe)
 	/* Nothing found means we drop the port as its probably not there */
 
 	ret = -ENODEV;
-	ata_link_for_each_dev(dev, &ap->link) {
+	ata_for_each_dev(dev, &ap->link, ALL) {
 		if (!ata_dev_absent(dev)) {
 			legacy_host[probe->slot] = host;
 			ld->platform_dev = pdev;
diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c
index 0e1c2c1134d..d43e8be96b8 100644
--- a/drivers/ata/pata_pdc2027x.c
+++ b/drivers/ata/pata_pdc2027x.c
@@ -406,23 +406,20 @@ static int pdc2027x_set_mode(struct ata_link *link, struct ata_device **r_failed
 	if (rc < 0)
 		return rc;
 
-	ata_link_for_each_dev(dev, link) {
-		if (ata_dev_enabled(dev)) {
-
-			pdc2027x_set_piomode(ap, dev);
-
-			/*
-			 * Enable prefetch if the device support PIO only.
-			 */
-			if (dev->xfer_shift == ATA_SHIFT_PIO) {
-				u32 ctcr1 = ioread32(dev_mmio(ap, dev, PDC_CTCR1));
-				ctcr1 |= (1 << 25);
-				iowrite32(ctcr1, dev_mmio(ap, dev, PDC_CTCR1));
-
-				PDPRINTK("Turn on prefetch\n");
-			} else {
-				pdc2027x_set_dmamode(ap, dev);
-			}
+	ata_for_each_dev(dev, link, ENABLED) {
+		pdc2027x_set_piomode(ap, dev);
+
+		/*
+		 * Enable prefetch if the device support PIO only.
+		 */
+		if (dev->xfer_shift == ATA_SHIFT_PIO) {
+			u32 ctcr1 = ioread32(dev_mmio(ap, dev, PDC_CTCR1));
+			ctcr1 |= (1 << 25);
+			iowrite32(ctcr1, dev_mmio(ap, dev, PDC_CTCR1));
+
+			PDPRINTK("Turn on prefetch\n");
+		} else {
+			pdc2027x_set_dmamode(ap, dev);
 		}
 	}
 	return 0;
diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c
index 77e4e3b17f5..6afa07a3764 100644
--- a/drivers/ata/pata_platform.c
+++ b/drivers/ata/pata_platform.c
@@ -34,14 +34,12 @@ static int pata_platform_set_mode(struct ata_link *link, struct ata_device **unu
 {
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, link) {
-		if (ata_dev_enabled(dev)) {
-			/* We don't really care */
-			dev->pio_mode = dev->xfer_mode = XFER_PIO_0;
-			dev->xfer_shift = ATA_SHIFT_PIO;
-			dev->flags |= ATA_DFLAG_PIO;
-			ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
-		}
+	ata_for_each_dev(dev, link, ENABLED) {
+		/* We don't really care */
+		dev->pio_mode = dev->xfer_mode = XFER_PIO_0;
+		dev->xfer_shift = ATA_SHIFT_PIO;
+		dev->flags |= ATA_DFLAG_PIO;
+		ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
 	}
 	return 0;
 }
diff --git a/drivers/ata/pata_rz1000.c b/drivers/ata/pata_rz1000.c
index 7dfd1f3f6f3..46d6bc1bf1e 100644
--- a/drivers/ata/pata_rz1000.c
+++ b/drivers/ata/pata_rz1000.c
@@ -38,15 +38,13 @@ static int rz1000_set_mode(struct ata_link *link, struct ata_device **unused)
 {
 	struct ata_device *dev;
 
-	ata_link_for_each_dev(dev, link) {
-		if (ata_dev_enabled(dev)) {
-			/* We don't really care */
-			dev->pio_mode = XFER_PIO_0;
-			dev->xfer_mode = XFER_PIO_0;
-			dev->xfer_shift = ATA_SHIFT_PIO;
-			dev->flags |= ATA_DFLAG_PIO;
-			ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
-		}
+	ata_for_each_dev(dev, link, ENABLED) {
+		/* We don't really care */
+		dev->pio_mode = XFER_PIO_0;
+		dev->xfer_mode = XFER_PIO_0;
+		dev->xfer_shift = ATA_SHIFT_PIO;
+		dev->flags |= ATA_DFLAG_PIO;
+		ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
 	}
 	return 0;
 }
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index 031d7b7dee3..17737092177 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -278,7 +278,7 @@ static int sil_set_mode(struct ata_link *link, struct ata_device **r_failed)
 	if (rc)
 		return rc;
 
-	ata_link_for_each_dev(dev, link) {
+	ata_for_each_dev(dev, link, ALL) {
 		if (!ata_dev_enabled(dev))
 			dev_mode[dev->devno] = 0;	/* PIO0/1/2 */
 		else if (dev->flags & ATA_DFLAG_PIO)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index ed3f26eb5df..3b2a0c6444e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1285,26 +1285,62 @@ static inline int ata_link_active(struct ata_link *link)
 	return ata_tag_valid(link->active_tag) || link->sactive;
 }
 
-extern struct ata_link *__ata_port_next_link(struct ata_port *ap,
-					     struct ata_link *link,
-					     bool dev_only);
-
-#define __ata_port_for_each_link(link, ap) \
-	for ((link) = __ata_port_next_link((ap), NULL, false); (link); \
-	     (link) = __ata_port_next_link((ap), (link), false))
-
-#define ata_port_for_each_link(link, ap) \
-	for ((link) = __ata_port_next_link((ap), NULL, true); (link); \
-	     (link) = __ata_port_next_link((ap), (link), true))
-
-#define ata_link_for_each_dev(dev, link) \
-	for ((dev) = (link)->device; \
-	     (dev) < (link)->device + ata_link_max_devices(link) || ((dev) = NULL); \
-	     (dev)++)
-
-#define ata_link_for_each_dev_reverse(dev, link) \
-	for ((dev) = (link)->device + ata_link_max_devices(link) - 1; \
-	     (dev) >= (link)->device || ((dev) = NULL); (dev)--)
+/*
+ * Iterators
+ *
+ * ATA_LITER_* constants are used to select link iteration mode and
+ * ATA_DITER_* device iteration mode.
+ *
+ * For a custom iteration directly using ata_{link|dev}_next(), if
+ * @link or @dev, respectively, is NULL, the first element is
+ * returned.  @dev and @link can be any valid device or link and the
+ * next element according to the iteration mode will be returned.
+ * After the last element, NULL is returned.
+ */
+enum ata_link_iter_mode {
+	ATA_LITER_EDGE,		/* if present, PMP links only; otherwise,
+				 * host link.  no slave link */
+	ATA_LITER_HOST_FIRST,	/* host link followed by PMP or slave links */
+	ATA_LITER_PMP_FIRST,	/* PMP links followed by host link,
+				 * slave link still comes after host link */
+};
+
+enum ata_dev_iter_mode {
+	ATA_DITER_ENABLED,
+	ATA_DITER_ENABLED_REVERSE,
+	ATA_DITER_ALL,
+	ATA_DITER_ALL_REVERSE,
+};
+
+extern struct ata_link *ata_link_next(struct ata_link *link,
+				      struct ata_port *ap,
+				      enum ata_link_iter_mode mode);
+
+extern struct ata_device *ata_dev_next(struct ata_device *dev,
+				       struct ata_link *link,
+				       enum ata_dev_iter_mode mode);
+
+/*
+ * Shortcut notation for iterations
+ *
+ * ata_for_each_link() iterates over each link of @ap according to
+ * @mode.  @link points to the current link in the loop.  @link is
+ * NULL after loop termination.  ata_for_each_dev() works the same way
+ * except that it iterates over each device of @link.
+ *
+ * Note that the mode prefixes ATA_{L|D}ITER_ shouldn't need to be
+ * specified when using the following shorthand notations.  Only the
+ * mode itself (EDGE, HOST_FIRST, ENABLED, etc...) should be
+ * specified.  This not only increases brevity but also makes it
+ * impossible to use ATA_LITER_* for device iteration or vice-versa.
+ */
+#define ata_for_each_link(link, ap, mode) \
+	for ((link) = ata_link_next(NULL, (ap), ATA_LITER_##mode); (link); \
+	     (link) = ata_link_next((link), (ap), ATA_LITER_##mode))
+
+#define ata_for_each_dev(dev, link, mode) \
+	for ((dev) = ata_dev_next(NULL, (link), ATA_DITER_##mode); (dev); \
+	     (dev) = ata_dev_next((dev), (link), ATA_DITER_##mode))
 
 /**
  *	ata_ncq_enabled - Test whether NCQ is enabled
-- 
cgit v1.2.3-70-g09d2


From ece180d1cfe5fa751eaa85bf796cf28b2150af15 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 3 Nov 2008 20:04:37 +0900
Subject: libata: perform port detach in EH

ata_port_detach() first made sure EH saw ATA_PFLAG_UNLOADING and then
assumed EH context belongs to it and performed detach operation
itself.  However, UNLOADING doesn't disable all of EH and this could
lead to problems including triggering WARN_ON()'s in EH path.

This patch makes port detach behave more like other EH actions such
that ata_port_detach() requests EH to detach and waits for completion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-core.c | 23 ++++-------------------
 drivers/ata/libata-eh.c   | 32 +++++++++++++++++++++++++++++++-
 include/linux/libata.h    |  3 ++-
 3 files changed, 37 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 1ecc3cb0b72..837fb60a6dc 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6107,8 +6107,6 @@ int ata_host_activate(struct ata_host *host, int irq,
 static void ata_port_detach(struct ata_port *ap)
 {
 	unsigned long flags;
-	struct ata_link *link;
-	struct ata_device *dev;
 
 	if (!ap->ops->error_handler)
 		goto skip_eh;
@@ -6116,28 +6114,15 @@ static void ata_port_detach(struct ata_port *ap)
 	/* tell EH we're leaving & flush EH */
 	spin_lock_irqsave(ap->lock, flags);
 	ap->pflags |= ATA_PFLAG_UNLOADING;
+	ata_port_schedule_eh(ap);
 	spin_unlock_irqrestore(ap->lock, flags);
 
+	/* wait till EH commits suicide */
 	ata_port_wait_eh(ap);
 
-	/* EH is now guaranteed to see UNLOADING - EH context belongs
-	 * to us.  Restore SControl and disable all existing devices.
-	 */
-	ata_for_each_link(link, ap, PMP_FIRST) {
-		sata_scr_write(link, SCR_CONTROL, link->saved_scontrol & 0xff0);
-		ata_for_each_dev(dev, link, ALL)
-			ata_dev_disable(dev);
-	}
-
-	/* Final freeze & EH.  All in-flight commands are aborted.  EH
-	 * will be skipped and retrials will be terminated with bad
-	 * target.
-	 */
-	spin_lock_irqsave(ap->lock, flags);
-	ata_port_freeze(ap);	/* won't be thawed */
-	spin_unlock_irqrestore(ap->lock, flags);
+	/* it better be dead now */
+	WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED));
 
-	ata_port_wait_eh(ap);
 	cancel_rearming_delayed_work(&ap->hotplug_task);
 
  skip_eh:
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index d673f3712bd..8147a838637 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -491,6 +491,31 @@ enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 	return ret;
 }
 
+static void ata_eh_unload(struct ata_port *ap)
+{
+	struct ata_link *link;
+	struct ata_device *dev;
+	unsigned long flags;
+
+	/* Restore SControl IPM and SPD for the next driver and
+	 * disable attached devices.
+	 */
+	ata_for_each_link(link, ap, PMP_FIRST) {
+		sata_scr_write(link, SCR_CONTROL, link->saved_scontrol & 0xff0);
+		ata_for_each_dev(dev, link, ALL)
+			ata_dev_disable(dev);
+	}
+
+	/* freeze and set UNLOADED */
+	spin_lock_irqsave(ap->lock, flags);
+
+	ata_port_freeze(ap);			/* won't be thawed */
+	ap->pflags &= ~ATA_PFLAG_EH_PENDING;	/* clear pending from freeze */
+	ap->pflags |= ATA_PFLAG_UNLOADED;
+
+	spin_unlock_irqrestore(ap->lock, flags);
+}
+
 /**
  *	ata_scsi_error - SCSI layer error handler callback
  *	@host: SCSI host on which error occurred
@@ -618,8 +643,13 @@ void ata_scsi_error(struct Scsi_Host *host)
 		/* invoke EH, skip if unloading or suspended */
 		if (!(ap->pflags & (ATA_PFLAG_UNLOADING | ATA_PFLAG_SUSPENDED)))
 			ap->ops->error_handler(ap);
-		else
+		else {
+			/* if unloading, commence suicide */
+			if ((ap->pflags & ATA_PFLAG_UNLOADING) &&
+			    !(ap->pflags & ATA_PFLAG_UNLOADED))
+				ata_eh_unload(ap);
 			ata_eh_finish(ap);
+		}
 
 		/* process port suspend request */
 		ata_eh_handle_port_suspend(ap);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 3b2a0c6444e..3449de597ef 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -213,10 +213,11 @@ enum {
 	ATA_PFLAG_FROZEN	= (1 << 2), /* port is frozen */
 	ATA_PFLAG_RECOVERED	= (1 << 3), /* recovery action performed */
 	ATA_PFLAG_LOADING	= (1 << 4), /* boot/loading probe */
-	ATA_PFLAG_UNLOADING	= (1 << 5), /* module is unloading */
 	ATA_PFLAG_SCSI_HOTPLUG	= (1 << 6), /* SCSI hotplug scheduled */
 	ATA_PFLAG_INITIALIZING	= (1 << 7), /* being initialized, don't touch */
 	ATA_PFLAG_RESETTING	= (1 << 8), /* reset in progress */
+	ATA_PFLAG_UNLOADING	= (1 << 9), /* driver is being unloaded */
+	ATA_PFLAG_UNLOADED	= (1 << 10), /* driver is unloaded */
 
 	ATA_PFLAG_SUSPENDED	= (1 << 17), /* port is suspended (power) */
 	ATA_PFLAG_PM_PENDING	= (1 << 18), /* PM operation pending */
-- 
cgit v1.2.3-70-g09d2


From 88e740f1654bf28565edd528a060695c1f2b5ad7 Mon Sep 17 00:00:00 2001
From: Fernando Luis Vázquez Cao <fernando@oss.ntt.co.jp>
Date: Mon, 27 Oct 2008 18:44:46 +0900
Subject: block: add queue flag for paravirt frontend drivers

As is the case with SSD devices, we do not want to idle in AS/CFQ when
the block device is a paravirt front-end driver. This patch adds a flag
(QUEUE_FLAG_VIRT) which should be used by front-end drivers such as
virtio_blk and xen-blkfront to indicate a paravirtualized device.

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 031a315c050..482e9600f7a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -449,6 +449,7 @@ struct request_queue
 #define QUEUE_FLAG_FAIL_IO     12	/* fake timeout */
 #define QUEUE_FLAG_STACKABLE   13	/* supports request stacking */
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
+#define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
-- 
cgit v1.2.3-70-g09d2


From 08bafc0341f2f7920e9045bc32c40299cac8c21b Mon Sep 17 00:00:00 2001
From: Keith Mannthey <kmannth@us.ibm.com>
Date: Tue, 25 Nov 2008 10:24:35 +0100
Subject: block: Supress Buffer I/O errors when SCSI REQ_QUIET flag set

Allow the scsi request REQ_QUIET flag to be propagated to the buffer
file system layer. The basic ideas is to pass the flag from the scsi
request to the bio (block IO) and then to the buffer layer.  The buffer
layer can then suppress needless printks.

This patch declutters the kernel log by removed the 40-50 (per lun)
buffer io error messages seen during a boot in my multipath setup . It
is a good chance any real errors will be missed in the "noise" it the
logs without this patch.

During boot I see blocks of messages like
"
__ratelimit: 211 callbacks suppressed
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242847
Buffer I/O error on device sdm, logical block 1
Buffer I/O error on device sdm, logical block 5242878
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242872
"
in my logs.

My disk environment is multipath fiber channel using the SCSI_DH_RDAC
code and multipathd.  This topology includes an "active" and "ghost"
path for each lun. IO's to the "ghost" path will never complete and the
SCSI layer, via the scsi device handler rdac code, quick returns the IOs
to theses paths and sets the REQ_QUIET scsi flag to suppress the scsi
layer messages.

 I am wanting to extend the QUIET behavior to include the buffer file
system layer to deal with these errors as well. I have been running this
patch for a while now on several boxes without issue.  A few runs of
bonnie++ show no noticeable difference in performance in my setup.

Thanks for John Stultz for the quiet_error finalization.

Submitted-by:  Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            |  3 +++
 fs/buffer.c                 | 19 +++++++++++++++----
 include/linux/bio.h         |  1 +
 include/linux/buffer_head.h |  1 +
 4 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 243d18b4ceb..20e1724ccb4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -153,6 +153,9 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 			nbytes = bio->bi_size;
 		}
 
+		if (unlikely(rq->cmd_flags & REQ_QUIET))
+			set_bit(BIO_QUIET, &bio->bi_flags);
+
 		bio->bi_size -= nbytes;
 		bio->bi_sector += (nbytes >> 9);
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa115..776ae091d3b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
+
+static int quiet_error(struct buffer_head *bh)
+{
+	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
+		return 0;
+	return 1;
+}
+
+
 static void buffer_io_error(struct buffer_head *bh)
 {
 	char b[BDEVNAME_SIZE];
-
 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -394,7 +402,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (printk_ratelimit())
+		if (!quiet_error(bh))
 			buffer_io_error(bh);
 		SetPageError(page);
 	}
@@ -455,7 +463,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (printk_ratelimit()) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -2913,6 +2921,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
+	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+		set_bit(BH_Quiet, &bh->b_state);
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6a642098e5c..cf132bfbbac 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -117,6 +117,7 @@ struct bio {
 #define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
 #define BIO_NULL_MAPPED 9	/* contains invalid user pages */
 #define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
+#define BIO_QUIET	11	/* Make BIO Quiet */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3ce64b90118..8605f8a74df 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -35,6 +35,7 @@ enum bh_state_bits {
 	BH_Ordered,	/* ordered write */
 	BH_Eopnotsupp,	/* operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
+	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
-- 
cgit v1.2.3-70-g09d2


From 64d01dc9e1927e6535627d73f2336c75d1dd3fe2 Mon Sep 17 00:00:00 2001
From: Cheng Renquan <crquan@gmail.com>
Date: Wed, 3 Dec 2008 12:41:39 +0100
Subject: block: use cancel_work_sync() instead of kblockd_flush_work()

After many improvements on kblockd_flush_work, it is now identical to
cancel_work_sync, so a direct call to cancel_work_sync is suggested.

The only difference is that cancel_work_sync is a GPL symbol,
so no non-GPL modules anymore.

Signed-off-by: Cheng Renquan <crquan@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c     | 2 +-
 block/blk-core.c       | 8 +-------
 block/cfq-iosched.c    | 2 +-
 include/linux/blkdev.h | 1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 71f0abb219e..802b5d0d853 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1344,7 +1344,7 @@ static void as_exit_queue(elevator_t *e)
 	struct as_data *ad = e->elevator_data;
 
 	del_timer_sync(&ad->antic_timer);
-	kblockd_flush_work(&ad->antic_work);
+	cancel_work_sync(&ad->antic_work);
 
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
diff --git a/block/blk-core.c b/block/blk-core.c
index 20e1724ccb4..2fdcd0cff57 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -408,7 +408,7 @@ void blk_sync_queue(struct request_queue *q)
 {
 	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
-	kblockd_flush_work(&q->unplug_work);
+	cancel_work_sync(&q->unplug_work);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -2147,12 +2147,6 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-void kblockd_flush_work(struct work_struct *work)
-{
-	cancel_work_sync(work);
-}
-EXPORT_SYMBOL(kblockd_flush_work);
-
 int __init blk_dev_init(void)
 {
 	kblockd_workqueue = create_workqueue("kblockd");
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6a062eebbd1..a2bfec7d6b3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2160,7 +2160,7 @@ out_cont:
 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 {
 	del_timer_sync(&cfqd->idle_slice_timer);
-	kblockd_flush_work(&cfqd->unplug_work);
+	cancel_work_sync(&cfqd->unplug_work);
 }
 
 static void cfq_put_async_queues(struct cfq_data *cfqd)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 482e9600f7a..e9bb73ff1d6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -978,7 +978,6 @@ static inline void put_dev_sector(Sector p)
 
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
-void kblockd_flush_work(struct work_struct *work);
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
-- 
cgit v1.2.3-70-g09d2


From ba744d5e290055d171c68067259fcc1e2721f542 Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Wed, 3 Dec 2008 12:41:40 +0100
Subject: block: reorder struct bio to remove padding on 64bit

Remove 8 bytes of padding from struct bio which also removes 16 bytes from
struct bio_pair to make it 248 bytes.  bio_pair then fits into one fewer
cache lines & into a smaller slab.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index cf132bfbbac..3ed714eb54d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -90,10 +90,11 @@ struct bio {
 
 	unsigned int		bi_comp_cpu;	/* completion CPU */
 
+	atomic_t		bi_cnt;		/* pin count */
+
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
 	bio_end_io_t		*bi_end_io;
-	atomic_t		bi_cnt;		/* pin count */
 
 	void			*bi_private;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-- 
cgit v1.2.3-70-g09d2


From 313e42999dbc0f234ca5909a236f78f082cb43b1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:02 +0900
Subject: block: reorganize QUEUE_ORDERED_* constants

Separate out ordering type (drain,) and action masks (preflush,
postflush, fua) from visible ordering mode selectors
(QUEUE_ORDERED_*).  Ordering types are now named QUEUE_ORDERED_BY_*
while action masks are named QUEUE_ORDERED_DO_*.

This change is necessary to add QUEUE_ORDERED_DO_BAR and make it
optional to improve empty barrier implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 20 ++++++++++----------
 include/linux/blkdev.h | 39 +++++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6e72d661ae4..1d7adc72c95 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -24,8 +24,8 @@
 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
 		      prepare_flush_fn *prepare_flush_fn)
 {
-	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
-	    prepare_flush_fn == NULL) {
+	if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH |
+					     QUEUE_ORDERED_DO_POSTFLUSH))) {
 		printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__);
 		return -EINVAL;
 	}
@@ -134,7 +134,7 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	struct request *rq;
 	rq_end_io_fn *end_io;
 
-	if (which == QUEUE_ORDERED_PREFLUSH) {
+	if (which == QUEUE_ORDERED_DO_PREFLUSH) {
 		rq = &q->pre_flush_rq;
 		end_io = pre_flush_end_io;
 	} else {
@@ -167,7 +167,7 @@ static inline struct request *start_ordered(struct request_queue *q,
 	blk_rq_init(q, rq);
 	if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
 		rq->cmd_flags |= REQ_RW;
-	if (q->ordered & QUEUE_ORDERED_FUA)
+	if (q->ordered & QUEUE_ORDERED_DO_FUA)
 		rq->cmd_flags |= REQ_FUA;
 	init_request_from_bio(rq, q->orig_bar_rq->bio);
 	rq->end_io = bar_end_io;
@@ -181,20 +181,20 @@ static inline struct request *start_ordered(struct request_queue *q,
 	 * there will be no data written between the pre and post flush.
 	 * Hence a single flush will suffice.
 	 */
-	if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
-		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && !blk_empty_barrier(rq))
+		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 	else
 		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 
-	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
 		rq = &q->pre_flush_rq;
 	} else
 		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+	if ((q->ordered & QUEUE_ORDERED_BY_TAG) || q->in_flight == 0)
 		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
 	else
 		rq = NULL;
@@ -237,7 +237,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
 		return 1;
 
-	if (q->ordered & QUEUE_ORDERED_TAG) {
+	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
 		/* Ordered by tag.  Blocking the next barrier is enough. */
 		if (is_barrier && rq != &q->bar_rq)
 			*rqp = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e9bb73ff1d6..5c92b443239 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -523,22 +523,29 @@ enum {
 	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
 	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
 	 */
-	QUEUE_ORDERED_NONE	= 0x00,
-	QUEUE_ORDERED_DRAIN	= 0x01,
-	QUEUE_ORDERED_TAG	= 0x02,
-
-	QUEUE_ORDERED_PREFLUSH	= 0x10,
-	QUEUE_ORDERED_POSTFLUSH	= 0x20,
-	QUEUE_ORDERED_FUA	= 0x40,
-
-	QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
-	QUEUE_ORDERED_DRAIN_FUA	= QUEUE_ORDERED_DRAIN |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
-	QUEUE_ORDERED_TAG_FLUSH	= QUEUE_ORDERED_TAG |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
-	QUEUE_ORDERED_TAG_FUA	= QUEUE_ORDERED_TAG |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+	QUEUE_ORDERED_BY_DRAIN		= 0x01,
+	QUEUE_ORDERED_BY_TAG		= 0x02,
+	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
+	QUEUE_ORDERED_DO_FUA		= 0x80,
+
+	QUEUE_ORDERED_NONE		= 0x00,
+
+	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN,
+	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_POSTFLUSH,
+	QUEUE_ORDERED_DRAIN_FUA		= QUEUE_ORDERED_DRAIN |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_FUA,
+
+	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG,
+	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_POSTFLUSH,
+	QUEUE_ORDERED_TAG_FUA		= QUEUE_ORDERED_TAG |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_FUA,
 
 	/*
 	 * Ordered operation sequence
-- 
cgit v1.2.3-70-g09d2


From f671620e7d895af221bdfeda751d54fa55ed9546 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:04 +0900
Subject: block: make every barrier action optional

In all barrier sequences, the barrier write itself was always assumed
to be issued and thus didn't have corresponding control flag.  This
patch adds QUEUE_ORDERED_DO_BAR and unify action mask handling in
start_ordered() such that any barrier action can be skipped.

This patch doesn't introduce any visible behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 41 ++++++++++++++++++++++++-----------------
 include/linux/blkdev.h |  7 +++++--
 2 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 43d479a1e66..1efabf829c5 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -158,19 +158,10 @@ static inline struct request *start_ordered(struct request_queue *q,
 	q->ordered = q->next_ordered;
 	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
-	/*
-	 * Prep proxy barrier request.
-	 */
+	/* stash away the original request */
 	elv_dequeue_request(q, rq);
 	q->orig_bar_rq = rq;
-	rq = &q->bar_rq;
-	blk_rq_init(q, rq);
-	if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-		rq->cmd_flags |= REQ_RW;
-	if (q->ordered & QUEUE_ORDERED_DO_FUA)
-		rq->cmd_flags |= REQ_FUA;
-	init_request_from_bio(rq, q->orig_bar_rq->bio);
-	rq->end_io = bar_end_io;
+	rq = NULL;
 
 	/*
 	 * Queue ordered sequence.  As we stack them at the head, we
@@ -181,12 +172,28 @@ static inline struct request *start_ordered(struct request_queue *q,
 	 * there will be no data written between the pre and post flush.
 	 * Hence a single flush will suffice.
 	 */
-	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && !blk_empty_barrier(rq))
+	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) &&
+	    !blk_empty_barrier(q->orig_bar_rq)) {
 		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
-	else
+		rq = &q->post_flush_rq;
+	} else
 		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
 
-	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
+		rq = &q->bar_rq;
+
+		/* initialize proxy request and queue it */
+		blk_rq_init(q, rq);
+		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
+			rq->cmd_flags |= REQ_RW;
+		if (q->ordered & QUEUE_ORDERED_DO_FUA)
+			rq->cmd_flags |= REQ_FUA;
+		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		rq->end_io = bar_end_io;
+
+		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_BAR;
 
 	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
 		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
@@ -194,10 +201,10 @@ static inline struct request *start_ordered(struct request_queue *q,
 	} else
 		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_BY_TAG) || q->in_flight == 0)
-		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
-	else
+	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight)
 		rq = NULL;
+	else
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
 
 	return rq;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c92b443239..b044267009e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -526,12 +526,14 @@ enum {
 	QUEUE_ORDERED_BY_DRAIN		= 0x01,
 	QUEUE_ORDERED_BY_TAG		= 0x02,
 	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_DO_BAR		= 0x20,
 	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
 	QUEUE_ORDERED_DO_FUA		= 0x80,
 
 	QUEUE_ORDERED_NONE		= 0x00,
 
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN,
+	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN |
+					  QUEUE_ORDERED_DO_BAR,
 	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_POSTFLUSH,
@@ -539,7 +541,8 @@ enum {
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_FUA,
 
-	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG,
+	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG |
+					  QUEUE_ORDERED_DO_BAR,
 	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_POSTFLUSH,
-- 
cgit v1.2.3-70-g09d2


From 8f11b3e99a1136fcbb67316c3260f085299c0bff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:05 +0900
Subject: block: make barrier completion more robust

Barrier completion had the following assumptions.

* start_ordered() couldn't finish the whole sequence properly.  If all
  actions are to be skipped, q->ordseq is set correctly but the actual
  completion was never triggered thus hanging the barrier request.

* Drain completion in elv_complete_request() assumed that there's
  always at least one request in the queue when drain completes.

Both assumptions are true but these assumptions need to be removed to
improve empty barrier implementation.  This patch makes the following
changes.

* Make start_ordered() use blk_ordered_complete_seq() to mark skipped
  steps complete and notify __elv_next_request() that it should fetch
  the next request if the whole barrier has completed inside
  start_ordered().

* Make drain completion path in elv_complete_request() check whether
  the queue is empty.  Empty queue also indicates drain completion.

* While at it, convert 0/1 return from blk_do_ordered() to false/true.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 45 +++++++++++++++++++++++++++------------------
 block/elevator.c       | 10 +++++++---
 include/linux/blkdev.h |  4 ++--
 3 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 1efabf829c5..b03d88013e1 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -88,7 +88,7 @@ unsigned blk_ordered_req_seq(struct request *rq)
 		return QUEUE_ORDSEQ_DONE;
 }
 
-void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 {
 	struct request *rq;
 
@@ -99,7 +99,7 @@ void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 	q->ordseq |= seq;
 
 	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
-		return;
+		return false;
 
 	/*
 	 * Okay, sequence complete.
@@ -109,6 +109,8 @@ void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 
 	if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
 		BUG();
+
+	return true;
 }
 
 static void pre_flush_end_io(struct request *rq, int error)
@@ -151,9 +153,11 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static inline struct request *start_ordered(struct request_queue *q,
-					    struct request *rq)
+static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 {
+	struct request *rq = *rqp;
+	unsigned skip = 0;
+
 	q->orderr = 0;
 	q->ordered = q->next_ordered;
 	q->ordseq |= QUEUE_ORDSEQ_STARTED;
@@ -177,7 +181,7 @@ static inline struct request *start_ordered(struct request_queue *q,
 		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 		rq = &q->post_flush_rq;
 	} else
-		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+		skip |= QUEUE_ORDSEQ_POSTFLUSH;
 
 	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
 		rq = &q->bar_rq;
@@ -193,35 +197,40 @@ static inline struct request *start_ordered(struct request_queue *q,
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 	} else
-		q->ordseq |= QUEUE_ORDSEQ_BAR;
+		skip |= QUEUE_ORDSEQ_BAR;
 
 	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
 		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
 		rq = &q->pre_flush_rq;
 	} else
-		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
+		skip |= QUEUE_ORDSEQ_PREFLUSH;
 
 	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight)
 		rq = NULL;
 	else
-		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+		skip |= QUEUE_ORDSEQ_DRAIN;
+
+	*rqp = rq;
 
-	return rq;
+	/*
+	 * Complete skipped sequences.  If whole sequence is complete,
+	 * return false to tell elevator that this request is gone.
+	 */
+	return !blk_ordered_complete_seq(q, skip, 0);
 }
 
-int blk_do_ordered(struct request_queue *q, struct request **rqp)
+bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 {
 	struct request *rq = *rqp;
 	const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 
 	if (!q->ordseq) {
 		if (!is_barrier)
-			return 1;
+			return true;
 
-		if (q->next_ordered != QUEUE_ORDERED_NONE) {
-			*rqp = start_ordered(q, rq);
-			return 1;
-		} else {
+		if (q->next_ordered != QUEUE_ORDERED_NONE)
+			return start_ordered(q, rqp);
+		else {
 			/*
 			 * Queue ordering not supported.  Terminate
 			 * with prejudice.
@@ -231,7 +240,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 					      blk_rq_bytes(rq)))
 				BUG();
 			*rqp = NULL;
-			return 0;
+			return false;
 		}
 	}
 
@@ -242,7 +251,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 	/* Special requests are not subject to ordering rules. */
 	if (!blk_fs_request(rq) &&
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return 1;
+		return true;
 
 	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
 		/* Ordered by tag.  Blocking the next barrier is enough. */
@@ -255,7 +264,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 			*rqp = NULL;
 	}
 
-	return 1;
+	return true;
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/elevator.c b/block/elevator.c
index 86836dd179c..261ffaaf47b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -944,10 +944,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 * drained for flush sequence.
 	 */
 	if (unlikely(q->ordseq)) {
-		struct request *first_rq = list_entry_rq(q->queue_head.next);
-		if (q->in_flight == 0 &&
+		struct request *next = NULL;
+
+		if (!list_empty(&q->queue_head))
+			next = list_entry_rq(q->queue_head.next);
+
+		if (!q->in_flight &&
 		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
-		    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
+		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
 			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
 			blk_start_queueing(q);
 		}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b044267009e..3c7078e0129 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -866,10 +866,10 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
-extern int blk_do_ordered(struct request_queue *, struct request **);
+extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
-extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
+extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-- 
cgit v1.2.3-70-g09d2


From 58eea927d2de43dc6f03d1ba2c46e55854b31540 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:06 +0900
Subject: block: simplify empty barrier implementation

Empty barrier required special handling in __elv_next_request() to
complete it without letting the low level driver see it.

With previous changes, barrier code is now flexible enough to skip the
BAR step using the same barrier sequence selection mechanism.  Drop
the special handling and mask off q->ordered from start_ordered().

Remove blk_empty_barrier() test which now has no user.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 16 ++++++++++------
 block/elevator.c       |  8 --------
 include/linux/blkdev.h |  1 -
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index b03d88013e1..c63044e9c4c 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -162,6 +162,14 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	q->ordered = q->next_ordered;
 	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
+	/*
+	 * For an empty barrier, there's no actual BAR request, which
+	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
+	 */
+	if (!rq->hard_nr_sectors)
+		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+				QUEUE_ORDERED_DO_POSTFLUSH);
+
 	/* stash away the original request */
 	elv_dequeue_request(q, rq);
 	q->orig_bar_rq = rq;
@@ -171,13 +179,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * Queue ordered sequence.  As we stack them at the head, we
 	 * need to queue in reverse order.  Note that we rely on that
 	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
-	 * request gets inbetween ordered sequence. If this request is
-	 * an empty barrier, we don't need to do a postflush ever since
-	 * there will be no data written between the pre and post flush.
-	 * Hence a single flush will suffice.
+	 * request gets inbetween ordered sequence.
 	 */
-	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) &&
-	    !blk_empty_barrier(q->orig_bar_rq)) {
+	if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
 		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 		rq = &q->post_flush_rq;
 	} else
diff --git a/block/elevator.c b/block/elevator.c
index 261ffaaf47b..ff60177a3ba 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -755,14 +755,6 @@ struct request *elv_next_request(struct request_queue *q)
 	int ret;
 
 	while ((rq = __elv_next_request(q)) != NULL) {
-		/*
-		 * Kill the empty barrier place holder, the driver must
-		 * not ever see it.
-		 */
-		if (blk_empty_barrier(rq)) {
-			__blk_end_request(rq, 0, blk_rq_bytes(rq));
-			continue;
-		}
 		if (!(rq->cmd_flags & REQ_STARTED)) {
 			/*
 			 * This is the first time the device driver
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3c7078e0129..41bbadfd17f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -596,7 +596,6 @@ enum {
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
 #define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
-#define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
 
-- 
cgit v1.2.3-70-g09d2


From 7ff9345ffac56743b5001561bc2dc1e041b79149 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 11 Dec 2008 11:53:43 +0100
Subject: bio: only mempool back the largest bio_vec slab cache

We only very rarely need the mempool backing, so it makes sense to
get rid of all but one of the mempool in a bio_set. So keep the
largest bio_vec count mempool so we can always honor the largest
allocation, and "upgrade" callers that fail.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c            | 125 ++++++++++++++++++++++++++++++----------------------
 include/linux/bio.h |   3 +-
 2 files changed, 74 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/fs/bio.c b/fs/bio.c
index df99c882b80..eb6b4683a26 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -58,7 +58,8 @@ unsigned int bvec_nr_vecs(unsigned short idx)
 	return bvec_slabs[idx].nr_vecs;
 }
 
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
+			      struct bio_set *bs)
 {
 	struct bio_vec *bvl;
 
@@ -67,60 +68,90 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
 	 * If not, this is a bio_kmalloc() allocation and just do a
 	 * kzalloc() for the exact number of vecs right away.
 	 */
-	if (bs) {
+	if (!bs)
+		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
+
+	/*
+	 * see comment near bvec_array define!
+	 */
+	switch (nr) {
+	case 1:
+		*idx = 0;
+		break;
+	case 2 ... 4:
+		*idx = 1;
+		break;
+	case 5 ... 16:
+		*idx = 2;
+		break;
+	case 17 ... 64:
+		*idx = 3;
+		break;
+	case 65 ... 128:
+		*idx = 4;
+		break;
+	case 129 ... BIO_MAX_PAGES:
+		*idx = 5;
+		break;
+	default:
+		return NULL;
+	}
+
+	/*
+	 * idx now points to the pool we want to allocate from. only the
+	 * 1-vec entry pool is mempool backed.
+	 */
+	if (*idx == BIOVEC_MAX_IDX) {
+fallback:
+		bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+	} else {
+		struct biovec_slab *bvs = bvec_slabs + *idx;
+		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+
 		/*
-		 * see comment near bvec_array define!
+		 * Make this allocation restricted and don't dump info on
+		 * allocation failures, since we'll fallback to the mempool
+		 * in case of failure.
 		 */
-		switch (nr) {
-		case 1:
-			*idx = 0;
-			break;
-		case 2 ... 4:
-			*idx = 1;
-			break;
-		case 5 ... 16:
-			*idx = 2;
-			break;
-		case 17 ... 64:
-			*idx = 3;
-			break;
-		case 65 ... 128:
-			*idx = 4;
-			break;
-		case 129 ... BIO_MAX_PAGES:
-			*idx = 5;
-			break;
-		default:
-			return NULL;
-		}
+		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
 
 		/*
-		 * idx now points to the pool we want to allocate from
+		 * Try a slab allocation. If this fails and __GFP_WAIT
+		 * is set, retry with the 1-entry mempool
 		 */
-		bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-		if (bvl)
-			memset(bvl, 0,
-				bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
-	} else
-		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
+		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
+		if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
+			*idx = BIOVEC_MAX_IDX;
+			goto fallback;
+		}
+	}
+
+	if (bvl)
+		memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
 
 	return bvl;
 }
 
-void bio_free(struct bio *bio, struct bio_set *bio_set)
+void bio_free(struct bio *bio, struct bio_set *bs)
 {
 	if (bio->bi_io_vec) {
 		const int pool_idx = BIO_POOL_IDX(bio);
 
 		BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
 
-		mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
+		if (pool_idx == BIOVEC_MAX_IDX)
+			mempool_free(bio->bi_io_vec, bs->bvec_pool);
+		else {
+			struct biovec_slab *bvs = bvec_slabs + pool_idx;
+
+			kmem_cache_free(bvs->slab, bio->bi_io_vec);
+		}
 	}
 
 	if (bio_integrity(bio))
-		bio_integrity_free(bio, bio_set);
+		bio_integrity_free(bio, bs);
 
-	mempool_free(bio, bio_set->bio_pool);
+	mempool_free(bio, bs->bio_pool);
 }
 
 /*
@@ -1346,30 +1377,18 @@ EXPORT_SYMBOL(bio_sector_offset);
  */
 static int biovec_create_pools(struct bio_set *bs, int pool_entries)
 {
-	int i;
+	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
 
-	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
-		struct biovec_slab *bp = bvec_slabs + i;
-		mempool_t **bvp = bs->bvec_pools + i;
+	bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
+	if (!bs->bvec_pool)
+		return -ENOMEM;
 
-		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
-		if (!*bvp)
-			return -ENOMEM;
-	}
 	return 0;
 }
 
 static void biovec_free_pools(struct bio_set *bs)
 {
-	int i;
-
-	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
-		mempool_t *bvp = bs->bvec_pools[i];
-
-		if (bvp)
-			mempool_destroy(bvp);
-	}
-
+	mempool_destroy(bs->bvec_pool);
 }
 
 void bioset_free(struct bio_set *bs)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3ed714eb54d..d76e4bf22f2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -397,13 +397,14 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
  */
 #define BIO_POOL_SIZE 2
 #define BIOVEC_NR_POOLS 6
+#define BIOVEC_MAX_IDX	(BIOVEC_NR_POOLS - 1)
 
 struct bio_set {
 	mempool_t *bio_pool;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	mempool_t *bio_integrity_pool;
 #endif
-	mempool_t *bvec_pools[BIOVEC_NR_POOLS];
+	mempool_t *bvec_pool;
 };
 
 struct biovec_slab {
-- 
cgit v1.2.3-70-g09d2


From 1b4344986926da324b5cd10b683e5a1a5e1b7db3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 22 Oct 2008 20:32:58 +0200
Subject: bio: move the slab pointer inside the bio_set

In preparation for adding differently sized bios.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c            | 7 +++++--
 include/linux/bio.h | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/bio.c b/fs/bio.c
index eb6b4683a26..1ab8986b041 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1404,12 +1404,15 @@ void bioset_free(struct bio_set *bs)
 
 struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
 {
-	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
+	struct bio_set *bs;
 
+	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
 	if (!bs)
 		return NULL;
 
-	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
+	bs->bio_slab = bio_slab;
+
+	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bs->bio_slab);
 	if (!bs->bio_pool)
 		goto bad;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d76e4bf22f2..9340098d75d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -400,6 +400,7 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
 #define BIOVEC_MAX_IDX	(BIOVEC_NR_POOLS - 1)
 
 struct bio_set {
+	struct kmem_cache *bio_slab;
 	mempool_t *bio_pool;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	mempool_t *bio_integrity_pool;
-- 
cgit v1.2.3-70-g09d2


From bb799ca0202a360fa74d5f17039b9100caebdde7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 10 Dec 2008 15:35:05 +0100
Subject: bio: allow individual slabs in the bio_set

Instead of having a global bio slab cache, add a reference to one
in each bio_set that is created. This allows for personalized slabs
in each bio_set, so that they can have bios of different sizes.

This means we can personalize the bios we return. File systems may
want to embed the bio inside another structure, to avoid allocation
more items (and stuffing them in ->bi_private) after the get a bio.
Or we may want to embed a number of bio_vecs directly at the end
of a bio, to avoid doing two allocations to return a bio. This is now
possible.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/dm-crypt.c |   2 +-
 drivers/md/dm-io.c    |   2 +-
 drivers/md/dm.c       |   2 +-
 fs/bio-integrity.c    |   2 +-
 fs/bio.c              | 191 ++++++++++++++++++++++++++++++++++++++++++--------
 include/linux/bio.h   |   6 +-
 6 files changed, 170 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ce26c84af06..3326750ec02 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1060,7 +1060,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_page_pool;
 	}
 
-	cc->bs = bioset_create(MIN_IOS, MIN_IOS);
+	cc->bs = bioset_create(MIN_IOS, 0);
 	if (!cc->bs) {
 		ti->error = "Cannot allocate crypt bioset";
 		goto bad_bs;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2fd6d445063..a34338567a2 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -56,7 +56,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(16, 16);
+	client->bios = bioset_create(16, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 343094c3fee..421c9f02d8c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1093,7 +1093,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->tio_pool)
 		goto bad_tio_pool;
 
-	md->bs = bioset_create(16, 16);
+	md->bs = bioset_create(16, 0);
 	if (!md->bs)
 		goto bad_no_bioset;
 
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 19caf7c962a..77ebc3c263d 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
-	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+	bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
 	mempool_free(bip, bs->bio_integrity_pool);
 
 	bio->bi_integrity = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index 1ab8986b041..0146f80789e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -31,8 +31,6 @@
 
 DEFINE_TRACE(block_split);
 
-static struct kmem_cache *bio_slab __read_mostly;
-
 static mempool_t *bio_split_pool __read_mostly;
 
 /*
@@ -40,9 +38,8 @@ static mempool_t *bio_split_pool __read_mostly;
  * break badly! cannot be bigger than what you can fit into an
  * unsigned short
  */
-
 #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
+struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
 };
 #undef BV
@@ -53,11 +50,119 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
  */
 struct bio_set *fs_bio_set;
 
+/*
+ * Our slab pool management
+ */
+struct bio_slab {
+	struct kmem_cache *slab;
+	unsigned int slab_ref;
+	unsigned int slab_size;
+	char name[8];
+};
+static DEFINE_MUTEX(bio_slab_lock);
+static struct bio_slab *bio_slabs;
+static unsigned int bio_slab_nr, bio_slab_max;
+
+static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
+{
+	unsigned int sz = sizeof(struct bio) + extra_size;
+	struct kmem_cache *slab = NULL;
+	struct bio_slab *bslab;
+	unsigned int i, entry = -1;
+
+	mutex_lock(&bio_slab_lock);
+
+	i = 0;
+	while (i < bio_slab_nr) {
+		struct bio_slab *bslab = &bio_slabs[i];
+
+		if (!bslab->slab && entry == -1)
+			entry = i;
+		else if (bslab->slab_size == sz) {
+			slab = bslab->slab;
+			bslab->slab_ref++;
+			break;
+		}
+		i++;
+	}
+
+	if (slab)
+		goto out_unlock;
+
+	if (bio_slab_nr == bio_slab_max && entry == -1) {
+		bio_slab_max <<= 1;
+		bio_slabs = krealloc(bio_slabs,
+				     bio_slab_max * sizeof(struct bio_slab),
+				     GFP_KERNEL);
+		if (!bio_slabs)
+			goto out_unlock;
+	}
+	if (entry == -1)
+		entry = bio_slab_nr++;
+
+	bslab = &bio_slabs[entry];
+
+	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
+	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!slab)
+		goto out_unlock;
+
+	printk("bio: create slab <%s> at %d\n", bslab->name, entry);
+	bslab->slab = slab;
+	bslab->slab_ref = 1;
+	bslab->slab_size = sz;
+out_unlock:
+	mutex_unlock(&bio_slab_lock);
+	return slab;
+}
+
+static void bio_put_slab(struct bio_set *bs)
+{
+	struct bio_slab *bslab = NULL;
+	unsigned int i;
+
+	mutex_lock(&bio_slab_lock);
+
+	for (i = 0; i < bio_slab_nr; i++) {
+		if (bs->bio_slab == bio_slabs[i].slab) {
+			bslab = &bio_slabs[i];
+			break;
+		}
+	}
+
+	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
+		goto out;
+
+	WARN_ON(!bslab->slab_ref);
+
+	if (--bslab->slab_ref)
+		goto out;
+
+	kmem_cache_destroy(bslab->slab);
+	bslab->slab = NULL;
+
+out:
+	mutex_unlock(&bio_slab_lock);
+}
+
 unsigned int bvec_nr_vecs(unsigned short idx)
 {
 	return bvec_slabs[idx].nr_vecs;
 }
 
+void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+{
+	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
+
+	if (idx == BIOVEC_MAX_IDX)
+		mempool_free(bv, bs->bvec_pool);
+	else {
+		struct biovec_slab *bvs = bvec_slabs + idx;
+
+		kmem_cache_free(bvs->slab, bv);
+	}
+}
+
 struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 			      struct bio_set *bs)
 {
@@ -134,24 +239,22 @@ fallback:
 
 void bio_free(struct bio *bio, struct bio_set *bs)
 {
-	if (bio->bi_io_vec) {
-		const int pool_idx = BIO_POOL_IDX(bio);
+	void *p;
 
-		BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
-
-		if (pool_idx == BIOVEC_MAX_IDX)
-			mempool_free(bio->bi_io_vec, bs->bvec_pool);
-		else {
-			struct biovec_slab *bvs = bvec_slabs + pool_idx;
-
-			kmem_cache_free(bvs->slab, bio->bi_io_vec);
-		}
-	}
+	if (bio->bi_io_vec)
+		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 	if (bio_integrity(bio))
 		bio_integrity_free(bio, bs);
 
-	mempool_free(bio, bs->bio_pool);
+	/*
+	 * If we have front padding, adjust the bio pointer before freeing
+	 */
+	p = bio;
+	if (bs->front_pad)
+		p -= bs->front_pad;
+
+	mempool_free(p, bs->bio_pool);
 }
 
 /*
@@ -188,16 +291,20 @@ void bio_init(struct bio *bio)
  *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
  *   fall back to just using @kmalloc to allocate the required memory.
  *
- *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure, or @kmalloc if none given.
+ *   Note that the caller must set ->bi_destructor on succesful return
+ *   of a bio, to do the appropriate freeing of the bio once the reference
+ *   count drops to zero.
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-	struct bio *bio;
+	struct bio *bio = NULL;
+
+	if (bs) {
+		void *p = mempool_alloc(bs->bio_pool, gfp_mask);
 
-	if (bs)
-		bio = mempool_alloc(bs->bio_pool, gfp_mask);
-	else
+		if (p)
+			bio = p + bs->front_pad;
+	} else
 		bio = kmalloc(sizeof(*bio), gfp_mask);
 
 	if (likely(bio)) {
@@ -1398,11 +1505,25 @@ void bioset_free(struct bio_set *bs)
 
 	bioset_integrity_free(bs);
 	biovec_free_pools(bs);
+	bio_put_slab(bs);
 
 	kfree(bs);
 }
 
-struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
+/**
+ * bioset_create  - Create a bio_set
+ * @pool_size:	Number of bio and bio_vecs to cache in the mempool
+ * @front_pad:	Number of bytes to allocate in front of the returned bio
+ *
+ * Description:
+ *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
+ *    to ask for a number of bytes to be allocated in front of the bio.
+ *    Front pad allocation is useful for embedding the bio inside
+ *    another structure, to avoid allocating extra data to go with the bio.
+ *    Note that the bio must be embedded at the END of that structure always,
+ *    or things will break badly.
+ */
+struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 {
 	struct bio_set *bs;
 
@@ -1410,16 +1531,22 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
 	if (!bs)
 		return NULL;
 
-	bs->bio_slab = bio_slab;
+	bs->front_pad = front_pad;
 
-	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bs->bio_slab);
+	bs->bio_slab = bio_find_or_create_slab(front_pad);
+	if (!bs->bio_slab) {
+		kfree(bs);
+		return NULL;
+	}
+
+	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (bioset_integrity_create(bs, bio_pool_size))
+	if (bioset_integrity_create(bs, pool_size))
 		goto bad;
 
-	if (!biovec_create_pools(bs, bvec_pool_size))
+	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
 bad:
@@ -1443,12 +1570,16 @@ static void __init biovec_init_slabs(void)
 
 static int __init init_bio(void)
 {
-	bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+	bio_slab_max = 2;
+	bio_slab_nr = 0;
+	bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
+	if (!bio_slabs)
+		panic("bio: can't allocate bios\n");
 
 	bio_integrity_init_slab();
 	biovec_init_slabs();
 
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
+	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9340098d75d..4b80d3537f9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -334,7 +334,7 @@ struct bio_pair {
 extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
 extern void bio_pair_release(struct bio_pair *dbio);
 
-extern struct bio_set *bioset_create(int, int);
+extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
 
 extern struct bio *bio_alloc(gfp_t, int);
@@ -379,6 +379,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
+extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 /*
@@ -401,6 +402,8 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
 
 struct bio_set {
 	struct kmem_cache *bio_slab;
+	unsigned int front_pad;
+
 	mempool_t *bio_pool;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	mempool_t *bio_integrity_pool;
@@ -415,6 +418,7 @@ struct biovec_slab {
 };
 
 extern struct bio_set *fs_bio_set;
+extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
 
 /*
  * a small number of entries is fine, not going to be performance critical.
-- 
cgit v1.2.3-70-g09d2


From 392ddc32982a5c661dd90dd49a3cb37f1c68b782 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 23 Dec 2008 12:42:54 +0100
Subject: bio: add support for inlining a number of bio_vecs inside the bio

When we go and allocate a bio for IO, we actually do two allocations.
One for the bio itself, and one for the bi_io_vec that holds the
actual pages we are interested in.

This feature inlines a definable amount of io vecs inside the bio
itself, so we eliminate the bio_vec array allocation for IO's up
to a certain size. It defaults to 4 vecs, which is typically 16k
of IO.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c            | 27 ++++++++++++++++++++++-----
 include/linux/bio.h | 12 ++++++++++++
 2 files changed, 34 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/bio.c b/fs/bio.c
index 0146f80789e..75e6be18ecd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -31,6 +31,12 @@
 
 DEFINE_TRACE(block_split);
 
+/*
+ * Test patch to inline a certain number of bi_io_vec's inside the bio
+ * itself, to shrink a bio data allocation from two mempool calls to one
+ */
+#define BIO_INLINE_VECS		4
+
 static mempool_t *bio_split_pool __read_mostly;
 
 /*
@@ -241,7 +247,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 {
 	void *p;
 
-	if (bio->bi_io_vec)
+	if (bio_has_allocated_vec(bio))
 		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 	if (bio_integrity(bio))
@@ -267,7 +273,8 @@ static void bio_fs_destructor(struct bio *bio)
 
 static void bio_kmalloc_destructor(struct bio *bio)
 {
-	kfree(bio->bi_io_vec);
+	if (bio_has_allocated_vec(bio))
+		kfree(bio->bi_io_vec);
 	kfree(bio);
 }
 
@@ -314,7 +321,16 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		if (likely(nr_iovecs)) {
 			unsigned long uninitialized_var(idx);
 
-			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+			if (nr_iovecs <= BIO_INLINE_VECS) {
+				idx = 0;
+				bvl = bio->bi_inline_vecs;
+				nr_iovecs = BIO_INLINE_VECS;
+				memset(bvl, 0, BIO_INLINE_VECS * sizeof(*bvl));
+			} else {
+				bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
+							bs);
+				nr_iovecs = bvec_nr_vecs(idx);
+			}
 			if (unlikely(!bvl)) {
 				if (bs)
 					mempool_free(bio, bs->bio_pool);
@@ -324,7 +340,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 				goto out;
 			}
 			bio->bi_flags |= idx << BIO_POOL_OFFSET;
-			bio->bi_max_vecs = bvec_nr_vecs(idx);
+			bio->bi_max_vecs = nr_iovecs;
 		}
 		bio->bi_io_vec = bvl;
 	}
@@ -1525,6 +1541,7 @@ void bioset_free(struct bio_set *bs)
  */
 struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 {
+	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
 	struct bio_set *bs;
 
 	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
@@ -1533,7 +1550,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 
 	bs->front_pad = front_pad;
 
-	bs->bio_slab = bio_find_or_create_slab(front_pad);
+	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
 	if (!bs->bio_slab) {
 		kfree(bs);
 		return NULL;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4b80d3537f9..18462c5b8ff 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -102,6 +102,13 @@ struct bio {
 #endif
 
 	bio_destructor_t	*bi_destructor;	/* destructor */
+
+	/*
+	 * We can inline a number of vecs at the end of the bio, to avoid
+	 * double allocations for a small number of bio_vecs. This member
+	 * MUST obviously be kept at the very end of the bio.
+	 */
+	struct bio_vec		bi_inline_vecs[0];
 };
 
 /*
@@ -213,6 +220,11 @@ static inline void *bio_data(struct bio *bio)
 	return NULL;
 }
 
+static inline int bio_has_allocated_vec(struct bio *bio)
+{
+	return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs;
+}
+
 /*
  * will die
  */
-- 
cgit v1.2.3-70-g09d2


From abf137dd7712132ee56d5b3143c2ff61a72a5faa Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 9 Dec 2008 08:11:22 +0100
Subject: aio: make the lookup_ioctx() lockless

The mm->ioctx_list is currently protected by a reader-writer lock,
so we always grab that lock on the read side for doing ioctx
lookups. As the workload is extremely reader biased, turn this into
an rcu hlist so we can make lookup_ioctx() lockless. Get rid of
the rwlock and use a spinlock for providing update side exclusion.

There's usually only 1 entry on this list, so it doesn't make sense
to look into fancier data structures.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/s390/mm/pgtable.c   |   4 +-
 fs/aio.c                 | 100 ++++++++++++++++++++++++++---------------------
 include/linux/aio.h      |   5 ++-
 include/linux/mm_types.h |   5 ++-
 kernel/fork.c            |   4 +-
 5 files changed, 67 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ef3635b52fc..0767827540b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -263,7 +263,7 @@ int s390_enable_sie(void)
 	/* lets check if we are allowed to replace the mm */
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-	    tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) {
+	    tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
 		task_unlock(tsk);
 		return -EINVAL;
 	}
@@ -279,7 +279,7 @@ int s390_enable_sie(void)
 	/* Now lets check again if something happened */
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-	    tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) {
+	    tsk->mm != tsk->active_mm || !hlist_empty(&tsk->mm->ioctx_list)) {
 		mmput(mm);
 		task_unlock(tsk);
 		return -EINVAL;
diff --git a/fs/aio.c b/fs/aio.c
index f658441d566..d6f89d3c15e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
 } while(0)
 
+static void ctx_rcu_free(struct rcu_head *head)
+{
+	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	unsigned nr_events = ctx->max_reqs;
+
+	kmem_cache_free(kioctx_cachep, ctx);
+
+	if (nr_events) {
+		spin_lock(&aio_nr_lock);
+		BUG_ON(aio_nr - nr_events > aio_nr);
+		aio_nr -= nr_events;
+		spin_unlock(&aio_nr_lock);
+	}
+}
 
 /* __put_ioctx
  *	Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
  */
 static void __put_ioctx(struct kioctx *ctx)
 {
-	unsigned nr_events = ctx->max_reqs;
-
 	BUG_ON(ctx->reqs_active);
 
 	cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
 	mmdrop(ctx->mm);
 	ctx->mm = NULL;
 	pr_debug("__put_ioctx: freeing %p\n", ctx);
-	kmem_cache_free(kioctx_cachep, ctx);
-
-	if (nr_events) {
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - nr_events > aio_nr);
-		aio_nr -= nr_events;
-		spin_unlock(&aio_nr_lock);
-	}
+	call_rcu(&ctx->rcu_head, ctx_rcu_free);
 }
 
 #define get_ioctx(kioctx) do {						\
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm;
 	struct kioctx *ctx;
+	int did_sync = 0;
 
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		goto out_freectx;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + ctx->max_reqs > aio_max_nr ||
-	    aio_nr + ctx->max_reqs < aio_nr)
-		ctx->max_reqs = 0;
-	else
-		aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	do {
+		spin_lock_bh(&aio_nr_lock);
+		if (aio_nr + nr_events > aio_max_nr ||
+		    aio_nr + nr_events < aio_nr)
+			ctx->max_reqs = 0;
+		else
+			aio_nr += ctx->max_reqs;
+		spin_unlock_bh(&aio_nr_lock);
+		if (ctx->max_reqs || did_sync)
+			break;
+
+		/* wait for rcu callbacks to have completed before giving up */
+		synchronize_rcu();
+		did_sync = 1;
+		ctx->max_reqs = nr_events;
+	} while (1);
+
 	if (ctx->max_reqs == 0)
 		goto out_cleanup;
 
 	/* now link into global list. */
-	write_lock(&mm->ioctx_list_lock);
-	ctx->next = mm->ioctx_list;
-	mm->ioctx_list = ctx;
-	write_unlock(&mm->ioctx_list_lock);
+	spin_lock(&mm->ioctx_lock);
+	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+	spin_unlock(&mm->ioctx_lock);
 
 	dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx *ctx = mm->ioctx_list;
-	mm->ioctx_list = NULL;
-	while (ctx) {
-		struct kioctx *next = ctx->next;
-		ctx->next = NULL;
+	struct kioctx *ctx;
+
+	while (!hlist_empty(&mm->ioctx_list)) {
+		ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
+		hlist_del_rcu(&ctx->list);
+
 		aio_cancel_all(ctx);
 
 		wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
 				atomic_read(&ctx->users), ctx->dead,
 				ctx->reqs_active);
 		put_ioctx(ctx);
-		ctx = next;
 	}
 }
 
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
-	struct kioctx *ioctx;
-	struct mm_struct *mm;
+	struct mm_struct *mm = current->mm;
+	struct kioctx *ctx = NULL;
+	struct hlist_node *n;
 
-	mm = current->mm;
-	read_lock(&mm->ioctx_list_lock);
-	for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
-		if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
-			get_ioctx(ioctx);
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
+		if (ctx->user_id == ctx_id && !ctx->dead) {
+			get_ioctx(ctx);
 			break;
 		}
-	read_unlock(&mm->ioctx_list_lock);
+	}
 
-	return ioctx;
+	rcu_read_unlock();
+	return ctx;
 }
 
 /*
@@ -1215,19 +1232,14 @@ out:
 static void io_destroy(struct kioctx *ioctx)
 {
 	struct mm_struct *mm = current->mm;
-	struct kioctx **tmp;
 	int was_dead;
 
 	/* delete the entry from the list is someone else hasn't already */
-	write_lock(&mm->ioctx_list_lock);
+	spin_lock(&mm->ioctx_lock);
 	was_dead = ioctx->dead;
 	ioctx->dead = 1;
-	for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
-	     tmp = &(*tmp)->next)
-		;
-	if (*tmp)
-		*tmp = ioctx->next;
-	write_unlock(&mm->ioctx_list_lock);
+	hlist_del_rcu(&ioctx->list);
+	spin_unlock(&mm->ioctx_lock);
 
 	dprintk("aio_release(%p)\n", ioctx);
 	if (likely(!was_dead))
diff --git a/include/linux/aio.h b/include/linux/aio.h
index f6b8cf99b59..b16a957030f 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -5,6 +5,7 @@
 #include <linux/workqueue.h>
 #include <linux/aio_abi.h>
 #include <linux/uio.h>
+#include <linux/rcupdate.h>
 
 #include <asm/atomic.h>
 
@@ -183,7 +184,7 @@ struct kioctx {
 
 	/* This needs improving */
 	unsigned long		user_id;
-	struct kioctx		*next;
+	struct hlist_node	list;
 
 	wait_queue_head_t	wait;
 
@@ -199,6 +200,8 @@ struct kioctx {
 	struct aio_ring_info	ring_info;
 
 	struct delayed_work	wq;
+
+	struct rcu_head		rcu_head;
 };
 
 /* prototypes */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fe825471d5a..9cfc9b627fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -232,8 +232,9 @@ struct mm_struct {
 	struct core_state *core_state; /* coredumping support */
 
 	/* aio bits */
-	rwlock_t		ioctx_list_lock;	/* aio lock */
-	struct kioctx		*ioctx_list;
+	spinlock_t		ioctx_lock;
+	struct hlist_head	ioctx_list;
+
 #ifdef CONFIG_MM_OWNER
 	/*
 	 * "owner" points to a task that is regarded as the canonical
diff --git a/kernel/fork.c b/kernel/fork.c
index 6144b36cd89..43cbf30669e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -415,8 +415,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
-	rwlock_init(&mm->ioctx_list_lock);
-	mm->ioctx_list = NULL;
+	spin_lock_init(&mm->ioctx_lock);
+	INIT_HLIST_HEAD(&mm->ioctx_list);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
 	mm_init_owner(mm, p);
-- 
cgit v1.2.3-70-g09d2


From b374d18a4bfce705e4a99ae9f501b53e86ecb283 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 31 Oct 2008 10:05:07 +0100
Subject: block: get rid of elevator_t typedef

Just use struct elevator_queue everywhere instead.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c       |  8 +++----
 block/cfq-iosched.c      |  6 +++---
 block/deadline-iosched.c |  6 +++---
 block/elevator.c         | 55 +++++++++++++++++++++++++-----------------------
 block/noop-iosched.c     |  2 +-
 drivers/block/nbd.c      |  2 +-
 include/linux/blkdev.h   |  3 +--
 include/linux/elevator.h |  8 +++----
 8 files changed, 46 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 802b5d0d853..631f6f44460 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1339,7 +1339,7 @@ static int as_may_queue(struct request_queue *q, int rw)
 	return ret;
 }
 
-static void as_exit_queue(elevator_t *e)
+static void as_exit_queue(struct elevator_queue *e)
 {
 	struct as_data *ad = e->elevator_data;
 
@@ -1409,7 +1409,7 @@ as_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t est_time_show(elevator_t *e, char *page)
+static ssize_t est_time_show(struct elevator_queue *e, char *page)
 {
 	struct as_data *ad = e->elevator_data;
 	int pos = 0;
@@ -1427,7 +1427,7 @@ static ssize_t est_time_show(elevator_t *e, char *page)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(elevator_t *e, char *page)		\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)	\
 {								\
 	struct as_data *ad = e->elevator_data;			\
 	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
@@ -1440,7 +1440,7 @@ SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct as_data *ad = e->elevator_data;				\
 	int ret = as_var_store(__PTR, (page), count);			\
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a2bfec7d6b3..adaf93a9d19 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2178,7 +2178,7 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
 		cfq_put_queue(cfqd->async_idle_cfqq);
 }
 
-static void cfq_exit_queue(elevator_t *e)
+static void cfq_exit_queue(struct elevator_queue *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 	struct request_queue *q = cfqd->queue;
@@ -2288,7 +2288,7 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(elevator_t *e, char *page)			\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 {									\
 	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data = __VAR;					\
@@ -2308,7 +2308,7 @@ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data;						\
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index fd311179f44..c4d991d4ade 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -334,7 +334,7 @@ static int deadline_queue_empty(struct request_queue *q)
 		&& list_empty(&dd->fifo_list[READ]);
 }
 
-static void deadline_exit_queue(elevator_t *e)
+static void deadline_exit_queue(struct elevator_queue *e)
 {
 	struct deadline_data *dd = e->elevator_data;
 
@@ -387,7 +387,7 @@ deadline_var_store(int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(elevator_t *e, char *page)			\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 {									\
 	struct deadline_data *dd = e->elevator_data;			\
 	int __data = __VAR;						\
@@ -403,7 +403,7 @@ SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct deadline_data *dd = e->elevator_data;			\
 	int __data;							\
diff --git a/block/elevator.c b/block/elevator.c
index ff60177a3ba..98259eda0ef 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -65,7 +65,7 @@ DEFINE_TRACE(block_rq_issue);
 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
 {
 	struct request_queue *q = rq->q;
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_allow_merge_fn)
 		return e->ops->elevator_allow_merge_fn(q, rq, bio);
@@ -208,13 +208,13 @@ __setup("elevator=", elevator_setup);
 
 static struct kobj_type elv_ktype;
 
-static elevator_t *elevator_alloc(struct request_queue *q,
+static struct elevator_queue *elevator_alloc(struct request_queue *q,
 				  struct elevator_type *e)
 {
-	elevator_t *eq;
+	struct elevator_queue *eq;
 	int i;
 
-	eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node);
+	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (unlikely(!eq))
 		goto err;
 
@@ -240,8 +240,9 @@ err:
 
 static void elevator_release(struct kobject *kobj)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elevator_queue *e;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	elevator_put(e->elevator_type);
 	kfree(e->hash);
 	kfree(e);
@@ -297,7 +298,7 @@ int elevator_init(struct request_queue *q, char *name)
 }
 EXPORT_SYMBOL(elevator_init);
 
-void elevator_exit(elevator_t *e)
+void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
 	if (e->ops->elevator_exit_fn)
@@ -311,7 +312,7 @@ EXPORT_SYMBOL(elevator_exit);
 
 static void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_activate_req_fn)
 		e->ops->elevator_activate_req_fn(q, rq);
@@ -319,7 +320,7 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
 
 static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_deactivate_req_fn)
 		e->ops->elevator_deactivate_req_fn(q, rq);
@@ -338,7 +339,7 @@ static void elv_rqhash_del(struct request_queue *q, struct request *rq)
 
 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	BUG_ON(ELV_ON_HASH(rq));
 	hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
@@ -352,7 +353,7 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 
 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
 	struct hlist_node *entry, *next;
 	struct request *rq;
@@ -494,7 +495,7 @@ EXPORT_SYMBOL(elv_dispatch_add_tail);
 
 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct request *__rq;
 	int ret;
 
@@ -529,7 +530,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_merged_fn)
 		e->ops->elevator_merged_fn(q, rq, type);
@@ -543,7 +544,7 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_merge_req_fn)
 		e->ops->elevator_merge_req_fn(q, rq, next);
@@ -846,7 +847,7 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 
 int elv_queue_empty(struct request_queue *q)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (!list_empty(&q->queue_head))
 		return 0;
@@ -860,7 +861,7 @@ EXPORT_SYMBOL(elv_queue_empty);
 
 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_latter_req_fn)
 		return e->ops->elevator_latter_req_fn(q, rq);
@@ -869,7 +870,7 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 
 struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_former_req_fn)
 		return e->ops->elevator_former_req_fn(q, rq);
@@ -878,7 +879,7 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 
 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_set_req_fn)
 		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
@@ -889,7 +890,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 
 void elv_put_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_put_req_fn)
 		e->ops->elevator_put_req_fn(rq);
@@ -897,7 +898,7 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 
 int elv_may_queue(struct request_queue *q, int rw)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_may_queue_fn)
 		return e->ops->elevator_may_queue_fn(q, rw);
@@ -920,7 +921,7 @@ EXPORT_SYMBOL(elv_abort_queue);
 
 void elv_completed_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	/*
 	 * request is released from the driver, io must be done
@@ -955,13 +956,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 static ssize_t
 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
 	struct elv_fs_entry *entry = to_elv(attr);
+	struct elevator_queue *e;
 	ssize_t error;
 
 	if (!entry->show)
 		return -EIO;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
 	error = e->ops ? entry->show(e, page) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
@@ -972,13 +974,14 @@ static ssize_t
 elv_attr_store(struct kobject *kobj, struct attribute *attr,
 	       const char *page, size_t length)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
 	struct elv_fs_entry *entry = to_elv(attr);
+	struct elevator_queue *e;
 	ssize_t error;
 
 	if (!entry->store)
 		return -EIO;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
 	error = e->ops ? entry->store(e, page, length) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
@@ -997,7 +1000,7 @@ static struct kobj_type elv_ktype = {
 
 int elv_register_queue(struct request_queue *q)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	int error;
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -1015,7 +1018,7 @@ int elv_register_queue(struct request_queue *q)
 	return error;
 }
 
-static void __elv_unregister_queue(elevator_t *e)
+static void __elv_unregister_queue(struct elevator_queue *e)
 {
 	kobject_uevent(&e->kobj, KOBJ_REMOVE);
 	kobject_del(&e->kobj);
@@ -1078,7 +1081,7 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  */
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
-	elevator_t *old_elevator, *e;
+	struct elevator_queue *old_elevator, *e;
 	void *data;
 
 	/*
@@ -1184,7 +1187,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct elevator_type *elv = e->elevator_type;
 	struct elevator_type *__e;
 	int len = 0;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index c23e0296965..3a0d369d08c 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -76,7 +76,7 @@ static void *noop_init_queue(struct request_queue *q)
 	return nd;
 }
 
-static void noop_exit_queue(elevator_t *e)
+static void noop_exit_queue(struct elevator_queue *e)
 {
 	struct noop_data *nd = e->elevator_data;
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d3a91cacee8..0766ce6187a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -722,7 +722,7 @@ static int __init nbd_init(void)
 
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = alloc_disk(1 << part_shift);
-		elevator_t *old_e;
+		struct elevator_queue *old_e;
 		if (!disk)
 			goto out;
 		nbd_dev[i].disk = disk;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 41bbadfd17f..7035cec583b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -26,7 +26,6 @@ struct scsi_ioctl_command;
 
 struct request_queue;
 struct elevator_queue;
-typedef struct elevator_queue elevator_t;
 struct request_pm_state;
 struct blk_trace;
 struct request;
@@ -313,7 +312,7 @@ struct request_queue
 	 */
 	struct list_head	queue_head;
 	struct request		*last_merge;
-	elevator_t		*elevator;
+	struct elevator_queue	*elevator;
 
 	/*
 	 * the queue request freelist, one for reads and one for writes
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 92f6f634e3e..7a204256b15 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -28,7 +28,7 @@ typedef void (elevator_activate_req_fn) (struct request_queue *, struct request
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
 typedef void *(elevator_init_fn) (struct request_queue *);
-typedef void (elevator_exit_fn) (elevator_t *);
+typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops
 {
@@ -62,8 +62,8 @@ struct elevator_ops
 
 struct elv_fs_entry {
 	struct attribute attr;
-	ssize_t (*show)(elevator_t *, char *);
-	ssize_t (*store)(elevator_t *, const char *, size_t);
+	ssize_t (*show)(struct elevator_queue *, char *);
+	ssize_t (*store)(struct elevator_queue *, const char *, size_t);
 };
 
 /*
@@ -130,7 +130,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *);
 extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 
 extern int elevator_init(struct request_queue *, char *);
-extern void elevator_exit(elevator_t *);
+extern void elevator_exit(struct elevator_queue *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
 
 /*
-- 
cgit v1.2.3-70-g09d2


From a6f23657d3072bde6844055bbc2290e497f33fbc Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 24 Oct 2008 12:52:42 +0200
Subject: block: add one-hit cache for disk partition lookup

disk_map_sector_rcu() returns a partition from a sector offset,
which we use for IO statistics on a per-partition basis. The
lookup itself is an O(N) list lookup, where N is the number of
partitions. This actually hurts performance quite a bit, even
on the lower end partitions. On higher numbered partitions,
it can get pretty bad.

Solve this by adding a one-hit cache for partition lookup.
This makes the lookup O(1) for the case where we do most IO to
one partition. Even for mixed partition workloads, amortized cost
is pretty close to O(1) since the natural IO batching makes the
one-hit cache last for lots of IOs.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 23 +++++++++++++++++++----
 include/linux/genhd.h |  1 +
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/genhd.c b/block/genhd.c
index 2f7feda61e3..d84a7df1e2a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -181,6 +181,12 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
 }
 EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 
+static inline int sector_in_part(struct hd_struct *part, sector_t sector)
+{
+	return part->start_sect <= sector &&
+		sector < part->start_sect + part->nr_sects;
+}
+
 /**
  * disk_map_sector_rcu - map sector to partition
  * @disk: gendisk of interest
@@ -199,16 +205,22 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 {
 	struct disk_part_tbl *ptbl;
+	struct hd_struct *part;
 	int i;
 
 	ptbl = rcu_dereference(disk->part_tbl);
 
+	part = rcu_dereference(ptbl->last_lookup);
+	if (part && sector_in_part(part, sector))
+		return part;
+
 	for (i = 1; i < ptbl->len; i++) {
-		struct hd_struct *part = rcu_dereference(ptbl->part[i]);
+		part = rcu_dereference(ptbl->part[i]);
 
-		if (part && part->start_sect <= sector &&
-		    sector < part->start_sect + part->nr_sects)
+		if (part && sector_in_part(part, sector)) {
+			rcu_assign_pointer(ptbl->last_lookup, part);
 			return part;
+		}
 	}
 	return &disk->part0;
 }
@@ -888,8 +900,11 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
-	if (old_ptbl)
+
+	if (old_ptbl) {
+		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
+	}
 }
 
 /**
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3df7742ce24..16948eaecae 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -126,6 +126,7 @@ struct blk_scsi_cmd_filter {
 struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
+	struct hd_struct *last_lookup;
 	struct hd_struct *part[];
 };
 
-- 
cgit v1.2.3-70-g09d2


From b3a6ffe16b5cc48abe7db8d04882dc45280eb693 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 12 Dec 2008 09:51:16 +0100
Subject: Get rid of CONFIG_LSF

We have two seperate config entries for large devices/files. One
is CONFIG_LBD that guards just the devices, the other is CONFIG_LSF
that handles large files. This doesn't make a lot of sense, you typically
want both or none. So get rid of CONFIG_LSF and change CONFIG_LBD wording
to indicate that it covers both.

Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Kconfig         | 23 +++++------------------
 fs/ext4/super.c       |  8 ++++----
 include/linux/types.h | 11 +++--------
 3 files changed, 12 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/block/Kconfig b/block/Kconfig
index 290b219fad9..ac0956f7778 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -24,21 +24,17 @@ menuconfig BLOCK
 if BLOCK
 
 config LBD
-	bool "Support for Large Block Devices"
+	bool "Support for large block devices and files"
 	depends on !64BIT
 	help
-	  Enable block devices of size 2TB and larger.
+	  Enable block devices or files of size 2TB and larger.
 
 	  This option is required to support the full capacity of large
 	  (2TB+) block devices, including RAID, disk, Network Block Device,
 	  Logical Volume Manager (LVM) and loopback.
-
-	  For example, RAID devices are frequently bigger than the capacity
-	  of the largest individual hard drive.
-
-	  This option is not required if you have individual disk drives
-	  which total 2TB+ and you are not aggregating the capacity into
-	  a large block device (e.g. using RAID or LVM).
+	
+	  This option also enables support for single files larger than
+	  2TB.
 
 	  If unsure, say N.
 
@@ -58,15 +54,6 @@ config BLK_DEV_IO_TRACE
 
 	  If unsure, say N.
 
-config LSF
-	bool "Support for Large Single Files"
-	depends on !64BIT
-	help
-	  Say Y here if you want to be able to handle very large files (2TB
-	  and larger), otherwise say N.
-
-	  If unsure, say Y.
-
 config BLK_DEV_BSG
 	bool "Block layer SG support v4 (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65db..04158ad74db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1721,7 +1721,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LSF is not enabled implies the inode
+		 * CONFIG_LBD is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -1764,7 +1764,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LSF is not enabled
+		 * !has_huge_files or CONFIG_LBD is not enabled
 		 * implies the inode i_block represent total blocks in
 		 * 512 bytes 32 == size of vfs inode i_blocks * 8
 		 */
@@ -2021,13 +2021,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LSF
+		 * mount if kernel is build with CONFIG_LBD
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
 			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LSF.\n", sb->s_id);
+					"without CONFIG_LBD.\n", sb->s_id);
 			goto failed_mount;
 		}
 	}
diff --git a/include/linux/types.h b/include/linux/types.h
index 1d98330b1f2..121f349cb7e 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -135,19 +135,14 @@ typedef		__s64		int64_t;
  *
  * Linux always considers sectors to be 512 bytes long independently
  * of the devices real block size.
+ *
+ * blkcnt_t is the type of the inode's block count.
  */
 #ifdef CONFIG_LBD
 typedef u64 sector_t;
-#else
-typedef unsigned long sector_t;
-#endif
-
-/*
- * The type of the inode's block count.
- */
-#ifdef CONFIG_LSF
 typedef u64 blkcnt_t;
 #else
+typedef unsigned long sector_t;
 typedef unsigned long blkcnt_t;
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From f453ba0460742ad027ae0c4c7d61e62817b3e7ef Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 7 Nov 2008 14:05:41 -0800
Subject: DRM: add mode setting support

Add mode setting support to the DRM layer.

This is a fairly big chunk of work that allows DRM drivers to provide
full output control and configuration capabilities to userspace.  It was
motivated by several factors:
  - the fb layer's APIs aren't suited for anything but simple
    configurations
  - coordination between the fb layer, DRM layer, and various userspace
    drivers is poor to non-existent (radeonfb excepted)
  - user level mode setting drivers makes displaying panic & oops
    messages more difficult
  - suspend/resume of graphics state is possible in many more
    configurations with kernel level support

This commit just adds the core DRM part of the mode setting APIs.
Driver specific commits using these new structure and APIs will follow.

Co-authors: Jesse Barnes <jbarnes@virtuousgeek.org>, Jakob Bornecrantz <jakob@tungstengraphics.com>
Contributors: Alan Hourihane <alanh@tungstengraphics.com>, Maarten Maathuis <madman2003@gmail.com>

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/Makefile          |    3 +-
 drivers/gpu/drm/drm_crtc.c        | 2497 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_crtc_helper.c |  822 ++++++++++++
 drivers/gpu/drm/drm_drv.c         |   34 +-
 drivers/gpu/drm/drm_edid.c        |  732 +++++++++++
 drivers/gpu/drm/drm_fops.c        |   22 +-
 drivers/gpu/drm/drm_irq.c         |   64 +-
 drivers/gpu/drm/drm_mm.c          |    1 +
 drivers/gpu/drm/drm_modes.c       |  576 +++++++++
 drivers/gpu/drm/drm_stub.c        |   30 +-
 drivers/gpu/drm/drm_sysfs.c       |  329 ++++-
 drivers/video/console/vgacon.c    |   17 +
 include/drm/Kbuild                |    2 +-
 include/drm/drm.h                 |   21 +
 include/drm/drmP.h                |   18 +
 include/drm/drm_crtc.h            |  737 +++++++++++
 include/drm/drm_crtc_helper.h     |  121 ++
 include/drm/drm_edid.h            |  202 +++
 include/drm/drm_mode.h            |  278 +++++
 include/linux/console.h           |    4 +
 20 files changed, 6469 insertions(+), 41 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_crtc.c
 create mode 100644 drivers/gpu/drm/drm_crtc_helper.c
 create mode 100644 drivers/gpu/drm/drm_edid.c
 create mode 100644 drivers/gpu/drm/drm_modes.c
 create mode 100644 include/drm/drm_crtc.h
 create mode 100644 include/drm/drm_crtc_helper.h
 create mode 100644 include/drm/drm_edid.h
 create mode 100644 include/drm/drm_mode.h

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 74da99495e2..30022c4a5c1 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -9,7 +9,8 @@ drm-y       :=	drm_auth.o drm_bufs.o drm_cache.o \
 		drm_drv.o drm_fops.o drm_gem.o drm_ioctl.o drm_irq.o \
 		drm_lock.o drm_memory.o drm_proc.o drm_stub.o drm_vm.o \
 		drm_agpsupport.o drm_scatter.o ati_pcigart.o drm_pci.o \
-		drm_sysfs.o drm_hashtab.o drm_sman.o drm_mm.o
+		drm_sysfs.o drm_hashtab.o drm_sman.o drm_mm.o \
+		drm_crtc.o drm_crtc_helper.o drm_modes.o drm_edid.o
 
 drm-$(CONFIG_COMPAT) += drm_ioc32.o
 
diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
new file mode 100644
index 00000000000..2e880240477
--- /dev/null
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -0,0 +1,2497 @@
+/*
+ * Copyright (c) 2006-2008 Intel Corporation
+ * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
+ * Copyright (c) 2008 Red Hat Inc.
+ *
+ * DRM core CRTC related functions
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Authors:
+ *      Keith Packard
+ *	Eric Anholt <eric@anholt.net>
+ *      Dave Airlie <airlied@linux.ie>
+ *      Jesse Barnes <jesse.barnes@intel.com>
+ */
+#include <linux/list.h>
+#include "drm.h"
+#include "drmP.h"
+#include "drm_crtc.h"
+
+struct drm_prop_enum_list {
+	int type;
+	char *name;
+};
+
+/* Avoid boilerplate.  I'm tired of typing. */
+#define DRM_ENUM_NAME_FN(fnname, list)				\
+	char *fnname(int val)					\
+	{							\
+		int i;						\
+		for (i = 0; i < ARRAY_SIZE(list); i++) {	\
+			if (list[i].type == val)		\
+				return list[i].name;		\
+		}						\
+		return "(unknown)";				\
+	}
+
+/*
+ * Global properties
+ */
+static struct drm_prop_enum_list drm_dpms_enum_list[] =
+{	{ DRM_MODE_DPMS_ON, "On" },
+	{ DRM_MODE_DPMS_STANDBY, "Standby" },
+	{ DRM_MODE_DPMS_SUSPEND, "Suspend" },
+	{ DRM_MODE_DPMS_OFF, "Off" }
+};
+
+DRM_ENUM_NAME_FN(drm_get_dpms_name, drm_dpms_enum_list)
+
+/*
+ * Optional properties
+ */
+static struct drm_prop_enum_list drm_scaling_mode_enum_list[] =
+{
+	{ DRM_MODE_SCALE_NON_GPU, "Non-GPU" },
+	{ DRM_MODE_SCALE_FULLSCREEN, "Fullscreen" },
+	{ DRM_MODE_SCALE_NO_SCALE, "No scale" },
+	{ DRM_MODE_SCALE_ASPECT, "Aspect" },
+};
+
+static struct drm_prop_enum_list drm_dithering_mode_enum_list[] =
+{
+	{ DRM_MODE_DITHERING_OFF, "Off" },
+	{ DRM_MODE_DITHERING_ON, "On" },
+};
+
+/*
+ * Non-global properties, but "required" for certain connectors.
+ */
+static struct drm_prop_enum_list drm_dvi_i_select_enum_list[] =
+{
+	{ DRM_MODE_SUBCONNECTOR_Automatic, "Automatic" }, /* DVI-I and TV-out */
+	{ DRM_MODE_SUBCONNECTOR_DVID,      "DVI-D"     }, /* DVI-I  */
+	{ DRM_MODE_SUBCONNECTOR_DVIA,      "DVI-A"     }, /* DVI-I  */
+};
+
+DRM_ENUM_NAME_FN(drm_get_dvi_i_select_name, drm_dvi_i_select_enum_list)
+
+static struct drm_prop_enum_list drm_dvi_i_subconnector_enum_list[] =
+{
+	{ DRM_MODE_SUBCONNECTOR_Unknown,   "Unknown"   }, /* DVI-I and TV-out */
+	{ DRM_MODE_SUBCONNECTOR_DVID,      "DVI-D"     }, /* DVI-I  */
+	{ DRM_MODE_SUBCONNECTOR_DVIA,      "DVI-A"     }, /* DVI-I  */
+};
+
+DRM_ENUM_NAME_FN(drm_get_dvi_i_subconnector_name,
+		 drm_dvi_i_subconnector_enum_list)
+
+static struct drm_prop_enum_list drm_tv_select_enum_list[] =
+{
+	{ DRM_MODE_SUBCONNECTOR_Automatic, "Automatic" }, /* DVI-I and TV-out */
+	{ DRM_MODE_SUBCONNECTOR_Composite, "Composite" }, /* TV-out */
+	{ DRM_MODE_SUBCONNECTOR_SVIDEO,    "SVIDEO"    }, /* TV-out */
+	{ DRM_MODE_SUBCONNECTOR_Component, "Component" }, /* TV-out */
+};
+
+DRM_ENUM_NAME_FN(drm_get_tv_select_name, drm_tv_select_enum_list)
+
+static struct drm_prop_enum_list drm_tv_subconnector_enum_list[] =
+{
+	{ DRM_MODE_SUBCONNECTOR_Unknown,   "Unknown"   }, /* DVI-I and TV-out */
+	{ DRM_MODE_SUBCONNECTOR_Composite, "Composite" }, /* TV-out */
+	{ DRM_MODE_SUBCONNECTOR_SVIDEO,    "SVIDEO"    }, /* TV-out */
+	{ DRM_MODE_SUBCONNECTOR_Component, "Component" }, /* TV-out */
+};
+
+DRM_ENUM_NAME_FN(drm_get_tv_subconnector_name,
+		 drm_tv_subconnector_enum_list)
+
+struct drm_conn_prop_enum_list {
+	int type;
+	char *name;
+	int count;
+};
+
+/*
+ * Connector and encoder types.
+ */
+static struct drm_conn_prop_enum_list drm_connector_enum_list[] =
+{	{ DRM_MODE_CONNECTOR_Unknown, "Unknown", 0 },
+	{ DRM_MODE_CONNECTOR_VGA, "VGA", 0 },
+	{ DRM_MODE_CONNECTOR_DVII, "DVI-I", 0 },
+	{ DRM_MODE_CONNECTOR_DVID, "DVI-D", 0 },
+	{ DRM_MODE_CONNECTOR_DVIA, "DVI-A", 0 },
+	{ DRM_MODE_CONNECTOR_Composite, "Composite", 0 },
+	{ DRM_MODE_CONNECTOR_SVIDEO, "SVIDEO", 0 },
+	{ DRM_MODE_CONNECTOR_LVDS, "LVDS", 0 },
+	{ DRM_MODE_CONNECTOR_Component, "Component", 0 },
+	{ DRM_MODE_CONNECTOR_9PinDIN, "9-pin DIN", 0 },
+	{ DRM_MODE_CONNECTOR_DisplayPort, "DisplayPort", 0 },
+	{ DRM_MODE_CONNECTOR_HDMIA, "HDMI Type A", 0 },
+	{ DRM_MODE_CONNECTOR_HDMIB, "HDMI Type B", 0 },
+};
+
+static struct drm_prop_enum_list drm_encoder_enum_list[] =
+{	{ DRM_MODE_ENCODER_NONE, "None" },
+	{ DRM_MODE_ENCODER_DAC, "DAC" },
+	{ DRM_MODE_ENCODER_TMDS, "TMDS" },
+	{ DRM_MODE_ENCODER_LVDS, "LVDS" },
+	{ DRM_MODE_ENCODER_TVDAC, "TV" },
+};
+
+char *drm_get_encoder_name(struct drm_encoder *encoder)
+{
+	static char buf[32];
+
+	snprintf(buf, 32, "%s-%d",
+		 drm_encoder_enum_list[encoder->encoder_type].name,
+		 encoder->base.id);
+	return buf;
+}
+
+char *drm_get_connector_name(struct drm_connector *connector)
+{
+	static char buf[32];
+
+	snprintf(buf, 32, "%s-%d",
+		 drm_connector_enum_list[connector->connector_type].name,
+		 connector->connector_type_id);
+	return buf;
+}
+EXPORT_SYMBOL(drm_get_connector_name);
+
+char *drm_get_connector_status_name(enum drm_connector_status status)
+{
+	if (status == connector_status_connected)
+		return "connected";
+	else if (status == connector_status_disconnected)
+		return "disconnected";
+	else
+		return "unknown";
+}
+
+/**
+ * drm_mode_object_get - allocate a new identifier
+ * @dev: DRM device
+ * @ptr: object pointer, used to generate unique ID
+ * @type: object type
+ *
+ * LOCKING:
+ * Caller must hold DRM mode_config lock.
+ *
+ * Create a unique identifier based on @ptr in @dev's identifier space.  Used
+ * for tracking modes, CRTCs and connectors.
+ *
+ * RETURNS:
+ * New unique (relative to other objects in @dev) integer identifier for the
+ * object.
+ */
+static int drm_mode_object_get(struct drm_device *dev,
+			       struct drm_mode_object *obj, uint32_t obj_type)
+{
+	int new_id = 0;
+	int ret;
+
+	WARN(!mutex_is_locked(&dev->mode_config.mutex),
+	     "%s called w/o mode_config lock\n", __FUNCTION__);
+again:
+	if (idr_pre_get(&dev->mode_config.crtc_idr, GFP_KERNEL) == 0) {
+		DRM_ERROR("Ran out memory getting a mode number\n");
+		return -EINVAL;
+	}
+
+	ret = idr_get_new_above(&dev->mode_config.crtc_idr, obj, 1, &new_id);
+	if (ret == -EAGAIN)
+		goto again;
+
+	obj->id = new_id;
+	obj->type = obj_type;
+	return 0;
+}
+
+/**
+ * drm_mode_object_put - free an identifer
+ * @dev: DRM device
+ * @id: ID to free
+ *
+ * LOCKING:
+ * Caller must hold DRM mode_config lock.
+ *
+ * Free @id from @dev's unique identifier pool.
+ */
+static void drm_mode_object_put(struct drm_device *dev,
+				struct drm_mode_object *object)
+{
+	idr_remove(&dev->mode_config.crtc_idr, object->id);
+}
+
+void *drm_mode_object_find(struct drm_device *dev, uint32_t id, uint32_t type)
+{
+	struct drm_mode_object *obj;
+
+	obj = idr_find(&dev->mode_config.crtc_idr, id);
+	if (!obj || (obj->type != type) || (obj->id != id))
+		return NULL;
+
+	return obj;
+}
+EXPORT_SYMBOL(drm_mode_object_find);
+
+/**
+ * drm_crtc_from_fb - find the CRTC structure associated with an fb
+ * @dev: DRM device
+ * @fb: framebuffer in question
+ *
+ * LOCKING:
+ * Caller must hold mode_config lock.
+ *
+ * Find CRTC in the mode_config structure that matches @fb.
+ *
+ * RETURNS:
+ * Pointer to the CRTC or NULL if it wasn't found.
+ */
+struct drm_crtc *drm_crtc_from_fb(struct drm_device *dev,
+				  struct drm_framebuffer *fb)
+{
+	struct drm_crtc *crtc;
+
+	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
+		if (crtc->fb == fb)
+			return crtc;
+	}
+	return NULL;
+}
+
+/**
+ * drm_framebuffer_init - initialize a framebuffer
+ * @dev: DRM device
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Allocates an ID for the framebuffer's parent mode object, sets its mode
+ * functions & device file and adds it to the master fd list.
+ *
+ * RETURNS:
+ * Zero on success, error code on falure.
+ */
+int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
+			 const struct drm_framebuffer_funcs *funcs)
+{
+	int ret;
+
+	ret = drm_mode_object_get(dev, &fb->base, DRM_MODE_OBJECT_FB);
+	if (ret) {
+		return ret;
+	}
+
+	fb->dev = dev;
+	fb->funcs = funcs;
+	dev->mode_config.num_fb++;
+	list_add(&fb->head, &dev->mode_config.fb_list);
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_framebuffer_init);
+
+/**
+ * drm_framebuffer_cleanup - remove a framebuffer object
+ * @fb: framebuffer to remove
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Scans all the CRTCs in @dev's mode_config.  If they're using @fb, removes
+ * it, setting it to NULL.
+ */
+void drm_framebuffer_cleanup(struct drm_framebuffer *fb)
+{
+	struct drm_device *dev = fb->dev;
+	struct drm_crtc *crtc;
+
+	/* remove from any CRTC */
+	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
+		if (crtc->fb == fb)
+			crtc->fb = NULL;
+	}
+
+	drm_mode_object_put(dev, &fb->base);
+	list_del(&fb->head);
+	dev->mode_config.num_fb--;
+}
+EXPORT_SYMBOL(drm_framebuffer_cleanup);
+
+/**
+ * drm_crtc_init - Initialise a new CRTC object
+ * @dev: DRM device
+ * @crtc: CRTC object to init
+ * @funcs: callbacks for the new CRTC
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Inits a new object created as base part of an driver crtc object.
+ */
+void drm_crtc_init(struct drm_device *dev, struct drm_crtc *crtc,
+		   const struct drm_crtc_funcs *funcs)
+{
+	crtc->dev = dev;
+	crtc->funcs = funcs;
+
+	mutex_lock(&dev->mode_config.mutex);
+	drm_mode_object_get(dev, &crtc->base, DRM_MODE_OBJECT_CRTC);
+
+	list_add_tail(&crtc->head, &dev->mode_config.crtc_list);
+	dev->mode_config.num_crtc++;
+	mutex_unlock(&dev->mode_config.mutex);
+}
+EXPORT_SYMBOL(drm_crtc_init);
+
+/**
+ * drm_crtc_cleanup - Cleans up the core crtc usage.
+ * @crtc: CRTC to cleanup
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Cleanup @crtc. Removes from drm modesetting space
+ * does NOT free object, caller does that.
+ */
+void drm_crtc_cleanup(struct drm_crtc *crtc)
+{
+	struct drm_device *dev = crtc->dev;
+
+	if (crtc->gamma_store) {
+		kfree(crtc->gamma_store);
+		crtc->gamma_store = NULL;
+	}
+
+	drm_mode_object_put(dev, &crtc->base);
+	list_del(&crtc->head);
+	dev->mode_config.num_crtc--;
+}
+EXPORT_SYMBOL(drm_crtc_cleanup);
+
+/**
+ * drm_mode_probed_add - add a mode to a connector's probed mode list
+ * @connector: connector the new mode
+ * @mode: mode data
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Add @mode to @connector's mode list for later use.
+ */
+void drm_mode_probed_add(struct drm_connector *connector,
+			 struct drm_display_mode *mode)
+{
+	list_add(&mode->head, &connector->probed_modes);
+}
+EXPORT_SYMBOL(drm_mode_probed_add);
+
+/**
+ * drm_mode_remove - remove and free a mode
+ * @connector: connector list to modify
+ * @mode: mode to remove
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Remove @mode from @connector's mode list, then free it.
+ */
+void drm_mode_remove(struct drm_connector *connector,
+		     struct drm_display_mode *mode)
+{
+	list_del(&mode->head);
+	kfree(mode);
+}
+EXPORT_SYMBOL(drm_mode_remove);
+
+/**
+ * drm_connector_init - Init a preallocated connector
+ * @dev: DRM device
+ * @connector: the connector to init
+ * @funcs: callbacks for this connector
+ * @name: user visible name of the connector
+ *
+ * LOCKING:
+ * Caller must hold @dev's mode_config lock.
+ *
+ * Initialises a preallocated connector. Connectors should be
+ * subclassed as part of driver connector objects.
+ */
+void drm_connector_init(struct drm_device *dev,
+		     struct drm_connector *connector,
+		     const struct drm_connector_funcs *funcs,
+		     int connector_type)
+{
+	mutex_lock(&dev->mode_config.mutex);
+
+	connector->dev = dev;
+	connector->funcs = funcs;
+	drm_mode_object_get(dev, &connector->base, DRM_MODE_OBJECT_CONNECTOR);
+	connector->connector_type = connector_type;
+	connector->connector_type_id =
+		++drm_connector_enum_list[connector_type].count; /* TODO */
+	INIT_LIST_HEAD(&connector->user_modes);
+	INIT_LIST_HEAD(&connector->probed_modes);
+	INIT_LIST_HEAD(&connector->modes);
+	connector->edid_blob_ptr = NULL;
+
+	list_add_tail(&connector->head, &dev->mode_config.connector_list);
+	dev->mode_config.num_connector++;
+
+	drm_connector_attach_property(connector,
+				      dev->mode_config.edid_property, 0);
+
+	drm_connector_attach_property(connector,
+				      dev->mode_config.dpms_property, 0);
+
+	mutex_unlock(&dev->mode_config.mutex);
+}
+EXPORT_SYMBOL(drm_connector_init);
+
+/**
+ * drm_connector_cleanup - cleans up an initialised connector
+ * @connector: connector to cleanup
+ *
+ * LOCKING:
+ * Caller must hold @dev's mode_config lock.
+ *
+ * Cleans up the connector but doesn't free the object.
+ */
+void drm_connector_cleanup(struct drm_connector *connector)
+{
+	struct drm_device *dev = connector->dev;
+	struct drm_display_mode *mode, *t;
+
+	list_for_each_entry_safe(mode, t, &connector->probed_modes, head)
+		drm_mode_remove(connector, mode);
+
+	list_for_each_entry_safe(mode, t, &connector->modes, head)
+		drm_mode_remove(connector, mode);
+
+	list_for_each_entry_safe(mode, t, &connector->user_modes, head)
+		drm_mode_remove(connector, mode);
+
+	mutex_lock(&dev->mode_config.mutex);
+	drm_mode_object_put(dev, &connector->base);
+	list_del(&connector->head);
+	mutex_unlock(&dev->mode_config.mutex);
+}
+EXPORT_SYMBOL(drm_connector_cleanup);
+
+void drm_encoder_init(struct drm_device *dev,
+		      struct drm_encoder *encoder,
+		      const struct drm_encoder_funcs *funcs,
+		      int encoder_type)
+{
+	mutex_lock(&dev->mode_config.mutex);
+
+	encoder->dev = dev;
+
+	drm_mode_object_get(dev, &encoder->base, DRM_MODE_OBJECT_ENCODER);
+	encoder->encoder_type = encoder_type;
+	encoder->funcs = funcs;
+
+	list_add_tail(&encoder->head, &dev->mode_config.encoder_list);
+	dev->mode_config.num_encoder++;
+
+	mutex_unlock(&dev->mode_config.mutex);
+}
+EXPORT_SYMBOL(drm_encoder_init);
+
+void drm_encoder_cleanup(struct drm_encoder *encoder)
+{
+	struct drm_device *dev = encoder->dev;
+	mutex_lock(&dev->mode_config.mutex);
+	drm_mode_object_put(dev, &encoder->base);
+	list_del(&encoder->head);
+	mutex_unlock(&dev->mode_config.mutex);
+}
+EXPORT_SYMBOL(drm_encoder_cleanup);
+
+/**
+ * drm_mode_create - create a new display mode
+ * @dev: DRM device
+ *
+ * LOCKING:
+ * Caller must hold DRM mode_config lock.
+ *
+ * Create a new drm_display_mode, give it an ID, and return it.
+ *
+ * RETURNS:
+ * Pointer to new mode on success, NULL on error.
+ */
+struct drm_display_mode *drm_mode_create(struct drm_device *dev)
+{
+	struct drm_display_mode *nmode;
+
+	nmode = kzalloc(sizeof(struct drm_display_mode), GFP_KERNEL);
+	if (!nmode)
+		return NULL;
+
+	drm_mode_object_get(dev, &nmode->base, DRM_MODE_OBJECT_MODE);
+	return nmode;
+}
+EXPORT_SYMBOL(drm_mode_create);
+
+/**
+ * drm_mode_destroy - remove a mode
+ * @dev: DRM device
+ * @mode: mode to remove
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Free @mode's unique identifier, then free it.
+ */
+void drm_mode_destroy(struct drm_device *dev, struct drm_display_mode *mode)
+{
+	drm_mode_object_put(dev, &mode->base);
+
+	kfree(mode);
+}
+EXPORT_SYMBOL(drm_mode_destroy);
+
+static int drm_mode_create_standard_connector_properties(struct drm_device *dev)
+{
+	struct drm_property *edid;
+	struct drm_property *dpms;
+	int i;
+
+	/*
+	 * Standard properties (apply to all connectors)
+	 */
+	edid = drm_property_create(dev, DRM_MODE_PROP_BLOB |
+				   DRM_MODE_PROP_IMMUTABLE,
+				   "EDID", 0);
+	dev->mode_config.edid_property = edid;
+
+	dpms = drm_property_create(dev, DRM_MODE_PROP_ENUM,
+				   "DPMS", ARRAY_SIZE(drm_dpms_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_dpms_enum_list); i++)
+		drm_property_add_enum(dpms, i, drm_dpms_enum_list[i].type,
+				      drm_dpms_enum_list[i].name);
+	dev->mode_config.dpms_property = dpms;
+
+	return 0;
+}
+
+/**
+ * drm_mode_create_dvi_i_properties - create DVI-I specific connector properties
+ * @dev: DRM device
+ *
+ * Called by a driver the first time a DVI-I connector is made.
+ */
+int drm_mode_create_dvi_i_properties(struct drm_device *dev)
+{
+	struct drm_property *dvi_i_selector;
+	struct drm_property *dvi_i_subconnector;
+	int i;
+
+	if (dev->mode_config.dvi_i_select_subconnector_property)
+		return 0;
+
+	dvi_i_selector =
+		drm_property_create(dev, DRM_MODE_PROP_ENUM,
+				    "select subconnector",
+				    ARRAY_SIZE(drm_dvi_i_select_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_dvi_i_select_enum_list); i++)
+		drm_property_add_enum(dvi_i_selector, i,
+				      drm_dvi_i_select_enum_list[i].type,
+				      drm_dvi_i_select_enum_list[i].name);
+	dev->mode_config.dvi_i_select_subconnector_property = dvi_i_selector;
+
+	dvi_i_subconnector =
+		drm_property_create(dev, DRM_MODE_PROP_ENUM |
+				    DRM_MODE_PROP_IMMUTABLE,
+				    "subconnector",
+				    ARRAY_SIZE(drm_dvi_i_subconnector_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_dvi_i_subconnector_enum_list); i++)
+		drm_property_add_enum(dvi_i_subconnector, i,
+				      drm_dvi_i_subconnector_enum_list[i].type,
+				      drm_dvi_i_subconnector_enum_list[i].name);
+	dev->mode_config.dvi_i_subconnector_property = dvi_i_subconnector;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_mode_create_dvi_i_properties);
+
+/**
+ * drm_create_tv_properties - create TV specific connector properties
+ * @dev: DRM device
+ * @num_modes: number of different TV formats (modes) supported
+ * @modes: array of pointers to strings containing name of each format
+ *
+ * Called by a driver's TV initialization routine, this function creates
+ * the TV specific connector properties for a given device.  Caller is
+ * responsible for allocating a list of format names and passing them to
+ * this routine.
+ */
+int drm_mode_create_tv_properties(struct drm_device *dev, int num_modes,
+				  char *modes[])
+{
+	struct drm_property *tv_selector;
+	struct drm_property *tv_subconnector;
+	int i;
+
+	if (dev->mode_config.tv_select_subconnector_property)
+		return 0;
+
+	/*
+	 * Basic connector properties
+	 */
+	tv_selector = drm_property_create(dev, DRM_MODE_PROP_ENUM,
+					  "select subconnector",
+					  ARRAY_SIZE(drm_tv_select_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_tv_select_enum_list); i++)
+		drm_property_add_enum(tv_selector, i,
+				      drm_tv_select_enum_list[i].type,
+				      drm_tv_select_enum_list[i].name);
+	dev->mode_config.tv_select_subconnector_property = tv_selector;
+
+	tv_subconnector =
+		drm_property_create(dev, DRM_MODE_PROP_ENUM |
+				    DRM_MODE_PROP_IMMUTABLE, "subconnector",
+				    ARRAY_SIZE(drm_tv_subconnector_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_tv_subconnector_enum_list); i++)
+		drm_property_add_enum(tv_subconnector, i,
+				      drm_tv_subconnector_enum_list[i].type,
+				      drm_tv_subconnector_enum_list[i].name);
+	dev->mode_config.tv_subconnector_property = tv_subconnector;
+
+	/*
+	 * Other, TV specific properties: margins & TV modes.
+	 */
+	dev->mode_config.tv_left_margin_property =
+		drm_property_create(dev, DRM_MODE_PROP_RANGE,
+				    "left margin", 2);
+	dev->mode_config.tv_left_margin_property->values[0] = 0;
+	dev->mode_config.tv_left_margin_property->values[1] = 100;
+
+	dev->mode_config.tv_right_margin_property =
+		drm_property_create(dev, DRM_MODE_PROP_RANGE,
+				    "right margin", 2);
+	dev->mode_config.tv_right_margin_property->values[0] = 0;
+	dev->mode_config.tv_right_margin_property->values[1] = 100;
+
+	dev->mode_config.tv_top_margin_property =
+		drm_property_create(dev, DRM_MODE_PROP_RANGE,
+				    "top margin", 2);
+	dev->mode_config.tv_top_margin_property->values[0] = 0;
+	dev->mode_config.tv_top_margin_property->values[1] = 100;
+
+	dev->mode_config.tv_bottom_margin_property =
+		drm_property_create(dev, DRM_MODE_PROP_RANGE,
+				    "bottom margin", 2);
+	dev->mode_config.tv_bottom_margin_property->values[0] = 0;
+	dev->mode_config.tv_bottom_margin_property->values[1] = 100;
+
+	dev->mode_config.tv_mode_property =
+		drm_property_create(dev, DRM_MODE_PROP_ENUM,
+				    "mode", num_modes);
+	for (i = 0; i < num_modes; i++)
+		drm_property_add_enum(dev->mode_config.tv_mode_property, i,
+				      i, modes[i]);
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_mode_create_tv_properties);
+
+/**
+ * drm_mode_create_scaling_mode_property - create scaling mode property
+ * @dev: DRM device
+ *
+ * Called by a driver the first time it's needed, must be attached to desired
+ * connectors.
+ */
+int drm_mode_create_scaling_mode_property(struct drm_device *dev)
+{
+	struct drm_property *scaling_mode;
+	int i;
+
+	if (dev->mode_config.scaling_mode_property)
+		return 0;
+
+	scaling_mode =
+		drm_property_create(dev, DRM_MODE_PROP_ENUM, "scaling mode",
+				    ARRAY_SIZE(drm_scaling_mode_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_scaling_mode_enum_list); i++)
+		drm_property_add_enum(scaling_mode, i,
+				      drm_scaling_mode_enum_list[i].type,
+				      drm_scaling_mode_enum_list[i].name);
+
+	dev->mode_config.scaling_mode_property = scaling_mode;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_mode_create_scaling_mode_property);
+
+/**
+ * drm_mode_create_dithering_property - create dithering property
+ * @dev: DRM device
+ *
+ * Called by a driver the first time it's needed, must be attached to desired
+ * connectors.
+ */
+int drm_mode_create_dithering_property(struct drm_device *dev)
+{
+	struct drm_property *dithering_mode;
+	int i;
+
+	if (dev->mode_config.dithering_mode_property)
+		return 0;
+
+	dithering_mode =
+		drm_property_create(dev, DRM_MODE_PROP_ENUM, "dithering",
+				    ARRAY_SIZE(drm_dithering_mode_enum_list));
+	for (i = 0; i < ARRAY_SIZE(drm_dithering_mode_enum_list); i++)
+		drm_property_add_enum(dithering_mode, i,
+				      drm_dithering_mode_enum_list[i].type,
+				      drm_dithering_mode_enum_list[i].name);
+	dev->mode_config.dithering_mode_property = dithering_mode;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_mode_create_dithering_property);
+
+/**
+ * drm_mode_config_init - initialize DRM mode_configuration structure
+ * @dev: DRM device
+ *
+ * LOCKING:
+ * None, should happen single threaded at init time.
+ *
+ * Initialize @dev's mode_config structure, used for tracking the graphics
+ * configuration of @dev.
+ */
+void drm_mode_config_init(struct drm_device *dev)
+{
+	mutex_init(&dev->mode_config.mutex);
+	INIT_LIST_HEAD(&dev->mode_config.fb_list);
+	INIT_LIST_HEAD(&dev->mode_config.fb_kernel_list);
+	INIT_LIST_HEAD(&dev->mode_config.crtc_list);
+	INIT_LIST_HEAD(&dev->mode_config.connector_list);
+	INIT_LIST_HEAD(&dev->mode_config.encoder_list);
+	INIT_LIST_HEAD(&dev->mode_config.property_list);
+	INIT_LIST_HEAD(&dev->mode_config.property_blob_list);
+	idr_init(&dev->mode_config.crtc_idr);
+
+	mutex_lock(&dev->mode_config.mutex);
+	drm_mode_create_standard_connector_properties(dev);
+	mutex_unlock(&dev->mode_config.mutex);
+
+	/* Just to be sure */
+	dev->mode_config.num_fb = 0;
+	dev->mode_config.num_connector = 0;
+	dev->mode_config.num_crtc = 0;
+	dev->mode_config.num_encoder = 0;
+	dev->mode_config.hotplug_counter = 0;
+}
+EXPORT_SYMBOL(drm_mode_config_init);
+
+int drm_mode_group_init(struct drm_device *dev, struct drm_mode_group *group)
+{
+	uint32_t total_objects = 0;
+
+	total_objects += dev->mode_config.num_crtc;
+	total_objects += dev->mode_config.num_connector;
+	total_objects += dev->mode_config.num_encoder;
+
+	if (total_objects == 0)
+		return -EINVAL;
+
+	group->id_list = kzalloc(total_objects * sizeof(uint32_t), GFP_KERNEL);
+	if (!group->id_list)
+		return -ENOMEM;
+
+	group->num_crtcs = 0;
+	group->num_connectors = 0;
+	group->num_encoders = 0;
+	return 0;
+}
+
+int drm_mode_group_init_legacy_group(struct drm_device *dev,
+				     struct drm_mode_group *group)
+{
+	struct drm_crtc *crtc;
+	struct drm_encoder *encoder;
+	struct drm_connector *connector;
+	int ret;
+
+	if ((ret = drm_mode_group_init(dev, group)))
+		return ret;
+
+	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head)
+		group->id_list[group->num_crtcs++] = crtc->base.id;
+
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head)
+		group->id_list[group->num_crtcs + group->num_encoders++] =
+		encoder->base.id;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head)
+		group->id_list[group->num_crtcs + group->num_encoders +
+			       group->num_connectors++] = connector->base.id;
+
+	return 0;
+}
+
+/**
+ * drm_mode_config_cleanup - free up DRM mode_config info
+ * @dev: DRM device
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Free up all the connectors and CRTCs associated with this DRM device, then
+ * free up the framebuffers and associated buffer objects.
+ *
+ * FIXME: cleanup any dangling user buffer objects too
+ */
+void drm_mode_config_cleanup(struct drm_device *dev)
+{
+	struct drm_connector *connector, *ot;
+	struct drm_crtc *crtc, *ct;
+	struct drm_encoder *encoder, *enct;
+	struct drm_framebuffer *fb, *fbt;
+	struct drm_property *property, *pt;
+
+	list_for_each_entry_safe(encoder, enct, &dev->mode_config.encoder_list,
+				 head) {
+		encoder->funcs->destroy(encoder);
+	}
+
+	list_for_each_entry_safe(connector, ot,
+				 &dev->mode_config.connector_list, head) {
+		connector->funcs->destroy(connector);
+	}
+
+	list_for_each_entry_safe(property, pt, &dev->mode_config.property_list,
+				 head) {
+		drm_property_destroy(dev, property);
+	}
+
+	list_for_each_entry_safe(fb, fbt, &dev->mode_config.fb_list, head) {
+		fb->funcs->destroy(fb);
+	}
+
+	list_for_each_entry_safe(crtc, ct, &dev->mode_config.crtc_list, head) {
+		crtc->funcs->destroy(crtc);
+	}
+
+}
+EXPORT_SYMBOL(drm_mode_config_cleanup);
+
+int drm_mode_hotplug_ioctl(struct drm_device *dev,
+			   void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_hotplug *arg = data;
+
+	arg->counter = dev->mode_config.hotplug_counter;
+
+	return 0;
+}
+
+/**
+ * drm_crtc_convert_to_umode - convert a drm_display_mode into a modeinfo
+ * @out: drm_mode_modeinfo struct to return to the user
+ * @in: drm_display_mode to use
+ *
+ * LOCKING:
+ * None.
+ *
+ * Convert a drm_display_mode into a drm_mode_modeinfo structure to return to
+ * the user.
+ */
+void drm_crtc_convert_to_umode(struct drm_mode_modeinfo *out,
+			       struct drm_display_mode *in)
+{
+	out->clock = in->clock;
+	out->hdisplay = in->hdisplay;
+	out->hsync_start = in->hsync_start;
+	out->hsync_end = in->hsync_end;
+	out->htotal = in->htotal;
+	out->hskew = in->hskew;
+	out->vdisplay = in->vdisplay;
+	out->vsync_start = in->vsync_start;
+	out->vsync_end = in->vsync_end;
+	out->vtotal = in->vtotal;
+	out->vscan = in->vscan;
+	out->vrefresh = in->vrefresh;
+	out->flags = in->flags;
+	out->type = in->type;
+	strncpy(out->name, in->name, DRM_DISPLAY_MODE_LEN);
+	out->name[DRM_DISPLAY_MODE_LEN-1] = 0;
+}
+
+/**
+ * drm_crtc_convert_to_umode - convert a modeinfo into a drm_display_mode
+ * @out: drm_display_mode to return to the user
+ * @in: drm_mode_modeinfo to use
+ *
+ * LOCKING:
+ * None.
+ *
+ * Convert a drm_mode_modeinfo into a drm_display_mode structure to return to
+ * the caller.
+ */
+void drm_crtc_convert_umode(struct drm_display_mode *out,
+			    struct drm_mode_modeinfo *in)
+{
+	out->clock = in->clock;
+	out->hdisplay = in->hdisplay;
+	out->hsync_start = in->hsync_start;
+	out->hsync_end = in->hsync_end;
+	out->htotal = in->htotal;
+	out->hskew = in->hskew;
+	out->vdisplay = in->vdisplay;
+	out->vsync_start = in->vsync_start;
+	out->vsync_end = in->vsync_end;
+	out->vtotal = in->vtotal;
+	out->vscan = in->vscan;
+	out->vrefresh = in->vrefresh;
+	out->flags = in->flags;
+	out->type = in->type;
+	strncpy(out->name, in->name, DRM_DISPLAY_MODE_LEN);
+	out->name[DRM_DISPLAY_MODE_LEN-1] = 0;
+}
+
+/**
+ * drm_mode_getresources - get graphics configuration
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Takes mode config lock.
+ *
+ * Construct a set of configuration description structures and return
+ * them to the user, including CRTC, connector and framebuffer configuration.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_getresources(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv)
+{
+	struct drm_mode_card_res *card_res = data;
+	struct list_head *lh;
+	struct drm_framebuffer *fb;
+	struct drm_connector *connector;
+	struct drm_crtc *crtc;
+	struct drm_encoder *encoder;
+	int ret = 0;
+	int connector_count = 0;
+	int crtc_count = 0;
+	int fb_count = 0;
+	int encoder_count = 0;
+	int copied = 0, i;
+	uint32_t __user *fb_id;
+	uint32_t __user *crtc_id;
+	uint32_t __user *connector_id;
+	uint32_t __user *encoder_id;
+	struct drm_mode_group *mode_group;
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	/*
+	 * For the non-control nodes we need to limit the list of resources
+	 * by IDs in the group list for this node
+	 */
+	list_for_each(lh, &file_priv->fbs)
+		fb_count++;
+
+	mode_group = &file_priv->master->minor->mode_group;
+	if (file_priv->master->minor->type == DRM_MINOR_CONTROL) {
+
+		list_for_each(lh, &dev->mode_config.crtc_list)
+			crtc_count++;
+
+		list_for_each(lh, &dev->mode_config.connector_list)
+			connector_count++;
+
+		list_for_each(lh, &dev->mode_config.encoder_list)
+			encoder_count++;
+	} else {
+
+		crtc_count = mode_group->num_crtcs;
+		connector_count = mode_group->num_connectors;
+		encoder_count = mode_group->num_encoders;
+	}
+
+	card_res->max_height = dev->mode_config.max_height;
+	card_res->min_height = dev->mode_config.min_height;
+	card_res->max_width = dev->mode_config.max_width;
+	card_res->min_width = dev->mode_config.min_width;
+
+	/* handle this in 4 parts */
+	/* FBs */
+	if (card_res->count_fbs >= fb_count) {
+		copied = 0;
+		fb_id = (uint32_t __user *)(unsigned long)card_res->fb_id_ptr;
+		list_for_each_entry(fb, &file_priv->fbs, head) {
+			if (put_user(fb->base.id, fb_id + copied)) {
+				ret = -EFAULT;
+				goto out;
+			}
+			copied++;
+		}
+	}
+	card_res->count_fbs = fb_count;
+
+	/* CRTCs */
+	if (card_res->count_crtcs >= crtc_count) {
+		copied = 0;
+		crtc_id = (uint32_t __user *)(unsigned long)card_res->crtc_id_ptr;
+		if (file_priv->master->minor->type == DRM_MINOR_CONTROL) {
+			list_for_each_entry(crtc, &dev->mode_config.crtc_list,
+					    head) {
+				DRM_DEBUG("CRTC ID is %d\n", crtc->base.id);
+				if (put_user(crtc->base.id, crtc_id + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		} else {
+			for (i = 0; i < mode_group->num_crtcs; i++) {
+				if (put_user(mode_group->id_list[i],
+					     crtc_id + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		}
+	}
+	card_res->count_crtcs = crtc_count;
+
+	/* Encoders */
+	if (card_res->count_encoders >= encoder_count) {
+		copied = 0;
+		encoder_id = (uint32_t __user *)(unsigned long)card_res->encoder_id_ptr;
+		if (file_priv->master->minor->type == DRM_MINOR_CONTROL) {
+			list_for_each_entry(encoder,
+					    &dev->mode_config.encoder_list,
+					    head) {
+				DRM_DEBUG("ENCODER ID is %d\n",
+					  encoder->base.id);
+				if (put_user(encoder->base.id, encoder_id +
+					     copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		} else {
+			for (i = mode_group->num_crtcs; i < mode_group->num_crtcs + mode_group->num_encoders; i++) {
+				if (put_user(mode_group->id_list[i],
+					     encoder_id + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+
+		}
+	}
+	card_res->count_encoders = encoder_count;
+
+	/* Connectors */
+	if (card_res->count_connectors >= connector_count) {
+		copied = 0;
+		connector_id = (uint32_t __user *)(unsigned long)card_res->connector_id_ptr;
+		if (file_priv->master->minor->type == DRM_MINOR_CONTROL) {
+			list_for_each_entry(connector,
+					    &dev->mode_config.connector_list,
+					    head) {
+				DRM_DEBUG("CONNECTOR ID is %d\n",
+					  connector->base.id);
+				if (put_user(connector->base.id,
+					     connector_id + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		} else {
+			int start = mode_group->num_crtcs +
+				mode_group->num_encoders;
+			for (i = start; i < start + mode_group->num_connectors; i++) {
+				if (put_user(mode_group->id_list[i],
+					     connector_id + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		}
+	}
+	card_res->count_connectors = connector_count;
+
+	DRM_DEBUG("Counted %d %d %d\n", card_res->count_crtcs,
+		  card_res->count_connectors, card_res->count_encoders);
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_mode_getcrtc - get CRTC configuration
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Caller? (FIXME)
+ *
+ * Construct a CRTC configuration structure to return to the user.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_getcrtc(struct drm_device *dev,
+		     void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_crtc *crtc_resp = data;
+	struct drm_crtc *crtc;
+	struct drm_mode_object *obj;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	obj = drm_mode_object_find(dev, crtc_resp->crtc_id,
+				   DRM_MODE_OBJECT_CRTC);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	crtc = obj_to_crtc(obj);
+
+	crtc_resp->x = crtc->x;
+	crtc_resp->y = crtc->y;
+	crtc_resp->gamma_size = crtc->gamma_size;
+	if (crtc->fb)
+		crtc_resp->fb_id = crtc->fb->base.id;
+	else
+		crtc_resp->fb_id = 0;
+
+	if (crtc->enabled) {
+
+		drm_crtc_convert_to_umode(&crtc_resp->mode, &crtc->mode);
+		crtc_resp->mode_valid = 1;
+
+	} else {
+		crtc_resp->mode_valid = 0;
+	}
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_mode_getconnector - get connector configuration
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Caller? (FIXME)
+ *
+ * Construct a connector configuration structure to return to the user.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_getconnector(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv)
+{
+	struct drm_mode_get_connector *out_resp = data;
+	struct drm_mode_object *obj;
+	struct drm_connector *connector;
+	struct drm_display_mode *mode;
+	int mode_count = 0;
+	int props_count = 0;
+	int encoders_count = 0;
+	int ret = 0;
+	int copied = 0;
+	int i;
+	struct drm_mode_modeinfo u_mode;
+	struct drm_mode_modeinfo __user *mode_ptr;
+	uint32_t __user *prop_ptr;
+	uint64_t __user *prop_values;
+	uint32_t __user *encoder_ptr;
+
+	memset(&u_mode, 0, sizeof(struct drm_mode_modeinfo));
+
+	DRM_DEBUG("connector id %d:\n", out_resp->connector_id);
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	obj = drm_mode_object_find(dev, out_resp->connector_id,
+				   DRM_MODE_OBJECT_CONNECTOR);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	connector = obj_to_connector(obj);
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_PROPERTY; i++) {
+		if (connector->property_ids[i] != 0) {
+			props_count++;
+		}
+	}
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++) {
+		if (connector->encoder_ids[i] != 0) {
+			encoders_count++;
+		}
+	}
+
+	if (out_resp->count_modes == 0) {
+		connector->funcs->fill_modes(connector,
+					     dev->mode_config.max_width,
+					     dev->mode_config.max_height);
+	}
+
+	/* delayed so we get modes regardless of pre-fill_modes state */
+	list_for_each_entry(mode, &connector->modes, head)
+		mode_count++;
+
+	out_resp->connector_id = connector->base.id;
+	out_resp->connector_type = connector->connector_type;
+	out_resp->connector_type_id = connector->connector_type_id;
+	out_resp->mm_width = connector->display_info.width_mm;
+	out_resp->mm_height = connector->display_info.height_mm;
+	out_resp->subpixel = connector->display_info.subpixel_order;
+	out_resp->connection = connector->status;
+	if (connector->encoder)
+		out_resp->encoder_id = connector->encoder->base.id;
+	else
+		out_resp->encoder_id = 0;
+
+	/*
+	 * This ioctl is called twice, once to determine how much space is
+	 * needed, and the 2nd time to fill it.
+	 */
+	if ((out_resp->count_modes >= mode_count) && mode_count) {
+		copied = 0;
+		mode_ptr = (struct drm_mode_modeinfo *)(unsigned long)out_resp->modes_ptr;
+		list_for_each_entry(mode, &connector->modes, head) {
+			drm_crtc_convert_to_umode(&u_mode, mode);
+			if (copy_to_user(mode_ptr + copied,
+					 &u_mode, sizeof(u_mode))) {
+				ret = -EFAULT;
+				goto out;
+			}
+			copied++;
+		}
+	}
+	out_resp->count_modes = mode_count;
+
+	if ((out_resp->count_props >= props_count) && props_count) {
+		copied = 0;
+		prop_ptr = (uint32_t *)(unsigned long)(out_resp->props_ptr);
+		prop_values = (uint64_t *)(unsigned long)(out_resp->prop_values_ptr);
+		for (i = 0; i < DRM_CONNECTOR_MAX_PROPERTY; i++) {
+			if (connector->property_ids[i] != 0) {
+				if (put_user(connector->property_ids[i],
+					     prop_ptr + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+
+				if (put_user(connector->property_values[i],
+					     prop_values + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		}
+	}
+	out_resp->count_props = props_count;
+
+	if ((out_resp->count_encoders >= encoders_count) && encoders_count) {
+		copied = 0;
+		encoder_ptr = (uint32_t *)(unsigned long)(out_resp->encoders_ptr);
+		for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++) {
+			if (connector->encoder_ids[i] != 0) {
+				if (put_user(connector->encoder_ids[i],
+					     encoder_ptr + copied)) {
+					ret = -EFAULT;
+					goto out;
+				}
+				copied++;
+			}
+		}
+	}
+	out_resp->count_encoders = encoders_count;
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+int drm_mode_getencoder(struct drm_device *dev, void *data,
+			struct drm_file *file_priv)
+{
+	struct drm_mode_get_encoder *enc_resp = data;
+	struct drm_mode_object *obj;
+	struct drm_encoder *encoder;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, enc_resp->encoder_id,
+				   DRM_MODE_OBJECT_ENCODER);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	encoder = obj_to_encoder(obj);
+
+	if (encoder->crtc)
+		enc_resp->crtc_id = encoder->crtc->base.id;
+	else
+		enc_resp->crtc_id = 0;
+	enc_resp->encoder_type = encoder->encoder_type;
+	enc_resp->encoder_id = encoder->base.id;
+	enc_resp->possible_crtcs = encoder->possible_crtcs;
+	enc_resp->possible_clones = encoder->possible_clones;
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_mode_setcrtc - set CRTC configuration
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Caller? (FIXME)
+ *
+ * Build a new CRTC configuration based on user request.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_setcrtc(struct drm_device *dev, void *data,
+		     struct drm_file *file_priv)
+{
+	struct drm_mode_config *config = &dev->mode_config;
+	struct drm_mode_crtc *crtc_req = data;
+	struct drm_mode_object *obj;
+	struct drm_crtc *crtc, *crtcfb;
+	struct drm_connector **connector_set = NULL, *connector;
+	struct drm_framebuffer *fb = NULL;
+	struct drm_display_mode *mode = NULL;
+	struct drm_mode_set set;
+	uint32_t __user *set_connectors_ptr;
+	int ret = 0;
+	int i;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, crtc_req->crtc_id,
+				   DRM_MODE_OBJECT_CRTC);
+	if (!obj) {
+		DRM_DEBUG("Unknown CRTC ID %d\n", crtc_req->crtc_id);
+		ret = -EINVAL;
+		goto out;
+	}
+	crtc = obj_to_crtc(obj);
+
+	if (crtc_req->mode_valid) {
+		/* If we have a mode we need a framebuffer. */
+		/* If we pass -1, set the mode with the currently bound fb */
+		if (crtc_req->fb_id == -1) {
+			list_for_each_entry(crtcfb,
+					    &dev->mode_config.crtc_list, head) {
+				if (crtcfb == crtc) {
+					DRM_DEBUG("Using current fb for setmode\n");
+					fb = crtc->fb;
+				}
+			}
+		} else {
+			obj = drm_mode_object_find(dev, crtc_req->fb_id,
+						   DRM_MODE_OBJECT_FB);
+			if (!obj) {
+				DRM_DEBUG("Unknown FB ID%d\n", crtc_req->fb_id);
+				ret = -EINVAL;
+				goto out;
+			}
+			fb = obj_to_fb(obj);
+		}
+
+		mode = drm_mode_create(dev);
+		drm_crtc_convert_umode(mode, &crtc_req->mode);
+		drm_mode_set_crtcinfo(mode, CRTC_INTERLACE_HALVE_V);
+	}
+
+	if (crtc_req->count_connectors == 0 && mode) {
+		DRM_DEBUG("Count connectors is 0 but mode set\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (crtc_req->count_connectors > 0 && !mode && !fb) {
+		DRM_DEBUG("Count connectors is %d but no mode or fb set\n",
+			  crtc_req->count_connectors);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (crtc_req->count_connectors > 0) {
+		u32 out_id;
+
+		/* Avoid unbounded kernel memory allocation */
+		if (crtc_req->count_connectors > config->num_connector) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		connector_set = kmalloc(crtc_req->count_connectors *
+					sizeof(struct drm_connector *),
+					GFP_KERNEL);
+		if (!connector_set) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		for (i = 0; i < crtc_req->count_connectors; i++) {
+			set_connectors_ptr = (uint32_t *)(unsigned long)crtc_req->set_connectors_ptr;
+			if (get_user(out_id, &set_connectors_ptr[i])) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			obj = drm_mode_object_find(dev, out_id,
+						   DRM_MODE_OBJECT_CONNECTOR);
+			if (!obj) {
+				DRM_DEBUG("Connector id %d unknown\n", out_id);
+				ret = -EINVAL;
+				goto out;
+			}
+			connector = obj_to_connector(obj);
+
+			connector_set[i] = connector;
+		}
+	}
+
+	set.crtc = crtc;
+	set.x = crtc_req->x;
+	set.y = crtc_req->y;
+	set.mode = mode;
+	set.connectors = connector_set;
+	set.num_connectors = crtc_req->count_connectors;
+	set.fb =fb;
+	ret = crtc->funcs->set_config(&set);
+
+out:
+	kfree(connector_set);
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+int drm_mode_cursor_ioctl(struct drm_device *dev,
+			void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_cursor *req = data;
+	struct drm_mode_object *obj;
+	struct drm_crtc *crtc;
+	int ret = 0;
+
+	DRM_DEBUG("\n");
+
+	if (!req->flags) {
+		DRM_ERROR("no operation set\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, req->crtc, DRM_MODE_OBJECT_CRTC);
+	if (!obj) {
+		DRM_DEBUG("Unknown CRTC ID %d\n", req->crtc);
+		ret = -EINVAL;
+		goto out;
+	}
+	crtc = obj_to_crtc(obj);
+
+	if (req->flags & DRM_MODE_CURSOR_BO) {
+		if (!crtc->funcs->cursor_set) {
+			DRM_ERROR("crtc does not support cursor\n");
+			ret = -ENXIO;
+			goto out;
+		}
+		/* Turns off the cursor if handle is 0 */
+		ret = crtc->funcs->cursor_set(crtc, file_priv, req->handle,
+					      req->width, req->height);
+	}
+
+	if (req->flags & DRM_MODE_CURSOR_MOVE) {
+		if (crtc->funcs->cursor_move) {
+			ret = crtc->funcs->cursor_move(crtc, req->x, req->y);
+		} else {
+			DRM_ERROR("crtc does not support cursor\n");
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_mode_addfb - add an FB to the graphics configuration
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Takes mode config lock.
+ *
+ * Add a new FB to the specified CRTC, given a user request.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_addfb(struct drm_device *dev,
+		   void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_fb_cmd *r = data;
+	struct drm_mode_config *config = &dev->mode_config;
+	struct drm_framebuffer *fb;
+	int ret = 0;
+
+	if ((config->min_width > r->width) || (r->width > config->max_width)) {
+		DRM_ERROR("mode new framebuffer width not within limits\n");
+		return -EINVAL;
+	}
+	if ((config->min_height > r->height) || (r->height > config->max_height)) {
+		DRM_ERROR("mode new framebuffer height not within limits\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	/* TODO check buffer is sufficently large */
+	/* TODO setup destructor callback */
+
+	fb = dev->mode_config.funcs->fb_create(dev, file_priv, r);
+	if (!fb) {
+		DRM_ERROR("could not create framebuffer\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	r->buffer_id = fb->base.id;
+	list_add(&fb->filp_head, &file_priv->fbs);
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_mode_rmfb - remove an FB from the configuration
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Takes mode config lock.
+ *
+ * Remove the FB specified by the user.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_rmfb(struct drm_device *dev,
+		   void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_object *obj;
+	struct drm_framebuffer *fb = NULL;
+	struct drm_framebuffer *fbl = NULL;
+	uint32_t *id = data;
+	int ret = 0;
+	int found = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, *id, DRM_MODE_OBJECT_FB);
+	/* TODO check that we realy get a framebuffer back. */
+	if (!obj) {
+		DRM_ERROR("mode invalid framebuffer id\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	fb = obj_to_fb(obj);
+
+	list_for_each_entry(fbl, &file_priv->fbs, filp_head)
+		if (fb == fbl)
+			found = 1;
+
+	if (!found) {
+		DRM_ERROR("tried to remove a fb that we didn't own\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* TODO release all crtc connected to the framebuffer */
+	/* TODO unhock the destructor from the buffer object */
+
+	list_del(&fb->filp_head);
+	fb->funcs->destroy(fb);
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_mode_getfb - get FB info
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * LOCKING:
+ * Caller? (FIXME)
+ *
+ * Lookup the FB given its ID and return info about it.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_getfb(struct drm_device *dev,
+		   void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_fb_cmd *r = data;
+	struct drm_mode_object *obj;
+	struct drm_framebuffer *fb;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, r->buffer_id, DRM_MODE_OBJECT_FB);
+	if (!obj) {
+		DRM_ERROR("invalid framebuffer id\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	fb = obj_to_fb(obj);
+
+	r->height = fb->height;
+	r->width = fb->width;
+	r->depth = fb->depth;
+	r->bpp = fb->bits_per_pixel;
+	r->pitch = fb->pitch;
+	fb->funcs->create_handle(fb, file_priv, &r->handle);
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+/**
+ * drm_fb_release - remove and free the FBs on this file
+ * @filp: file * from the ioctl
+ *
+ * LOCKING:
+ * Takes mode config lock.
+ *
+ * Destroy all the FBs associated with @filp.
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+void drm_fb_release(struct file *filp)
+{
+	struct drm_file *priv = filp->private_data;
+	struct drm_device *dev = priv->minor->dev;
+	struct drm_framebuffer *fb, *tfb;
+
+	mutex_lock(&dev->mode_config.mutex);
+	list_for_each_entry_safe(fb, tfb, &priv->fbs, filp_head) {
+		list_del(&fb->filp_head);
+		fb->funcs->destroy(fb);
+	}
+	mutex_unlock(&dev->mode_config.mutex);
+}
+
+/**
+ * drm_mode_attachmode - add a mode to the user mode list
+ * @dev: DRM device
+ * @connector: connector to add the mode to
+ * @mode: mode to add
+ *
+ * Add @mode to @connector's user mode list.
+ */
+static int drm_mode_attachmode(struct drm_device *dev,
+			       struct drm_connector *connector,
+			       struct drm_display_mode *mode)
+{
+	int ret = 0;
+
+	list_add_tail(&mode->head, &connector->user_modes);
+	return ret;
+}
+
+int drm_mode_attachmode_crtc(struct drm_device *dev, struct drm_crtc *crtc,
+			     struct drm_display_mode *mode)
+{
+	struct drm_connector *connector;
+	int ret = 0;
+	struct drm_display_mode *dup_mode;
+	int need_dup = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		if (!connector->encoder)
+			break;
+		if (connector->encoder->crtc == crtc) {
+			if (need_dup)
+				dup_mode = drm_mode_duplicate(dev, mode);
+			else
+				dup_mode = mode;
+			ret = drm_mode_attachmode(dev, connector, dup_mode);
+			if (ret)
+				return ret;
+			need_dup = 1;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL(drm_mode_attachmode_crtc);
+
+static int drm_mode_detachmode(struct drm_device *dev,
+			       struct drm_connector *connector,
+			       struct drm_display_mode *mode)
+{
+	int found = 0;
+	int ret = 0;
+	struct drm_display_mode *match_mode, *t;
+
+	list_for_each_entry_safe(match_mode, t, &connector->user_modes, head) {
+		if (drm_mode_equal(match_mode, mode)) {
+			list_del(&match_mode->head);
+			drm_mode_destroy(dev, match_mode);
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		ret = -EINVAL;
+
+	return ret;
+}
+
+int drm_mode_detachmode_crtc(struct drm_device *dev, struct drm_display_mode *mode)
+{
+	struct drm_connector *connector;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		drm_mode_detachmode(dev, connector, mode);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(drm_mode_detachmode_crtc);
+
+/**
+ * drm_fb_attachmode - Attach a user mode to an connector
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * This attaches a user specified mode to an connector.
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_attachmode_ioctl(struct drm_device *dev,
+			      void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_mode_cmd *mode_cmd = data;
+	struct drm_connector *connector;
+	struct drm_display_mode *mode;
+	struct drm_mode_object *obj;
+	struct drm_mode_modeinfo *umode = &mode_cmd->mode;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	obj = drm_mode_object_find(dev, mode_cmd->connector_id, DRM_MODE_OBJECT_CONNECTOR);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	connector = obj_to_connector(obj);
+
+	mode = drm_mode_create(dev);
+	if (!mode) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	drm_crtc_convert_umode(mode, umode);
+
+	ret = drm_mode_attachmode(dev, connector, mode);
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+
+/**
+ * drm_fb_detachmode - Detach a user specified mode from an connector
+ * @inode: inode from the ioctl
+ * @filp: file * from the ioctl
+ * @cmd: cmd from ioctl
+ * @arg: arg from ioctl
+ *
+ * Called by the user via ioctl.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_mode_detachmode_ioctl(struct drm_device *dev,
+			      void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_object *obj;
+	struct drm_mode_mode_cmd *mode_cmd = data;
+	struct drm_connector *connector;
+	struct drm_display_mode mode;
+	struct drm_mode_modeinfo *umode = &mode_cmd->mode;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	obj = drm_mode_object_find(dev, mode_cmd->connector_id, DRM_MODE_OBJECT_CONNECTOR);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	connector = obj_to_connector(obj);
+
+	drm_crtc_convert_umode(&mode, umode);
+	ret = drm_mode_detachmode(dev, connector, &mode);
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+struct drm_property *drm_property_create(struct drm_device *dev, int flags,
+					 const char *name, int num_values)
+{
+	struct drm_property *property = NULL;
+
+	property = kzalloc(sizeof(struct drm_property), GFP_KERNEL);
+	if (!property)
+		return NULL;
+
+	if (num_values) {
+		property->values = kzalloc(sizeof(uint64_t)*num_values, GFP_KERNEL);
+		if (!property->values)
+			goto fail;
+	}
+
+	drm_mode_object_get(dev, &property->base, DRM_MODE_OBJECT_PROPERTY);
+	property->flags = flags;
+	property->num_values = num_values;
+	INIT_LIST_HEAD(&property->enum_blob_list);
+
+	if (name)
+		strncpy(property->name, name, DRM_PROP_NAME_LEN);
+
+	list_add_tail(&property->head, &dev->mode_config.property_list);
+	return property;
+fail:
+	kfree(property);
+	return NULL;
+}
+EXPORT_SYMBOL(drm_property_create);
+
+int drm_property_add_enum(struct drm_property *property, int index,
+			  uint64_t value, const char *name)
+{
+	struct drm_property_enum *prop_enum;
+
+	if (!(property->flags & DRM_MODE_PROP_ENUM))
+		return -EINVAL;
+
+	if (!list_empty(&property->enum_blob_list)) {
+		list_for_each_entry(prop_enum, &property->enum_blob_list, head) {
+			if (prop_enum->value == value) {
+				strncpy(prop_enum->name, name, DRM_PROP_NAME_LEN);
+				prop_enum->name[DRM_PROP_NAME_LEN-1] = '\0';
+				return 0;
+			}
+		}
+	}
+
+	prop_enum = kzalloc(sizeof(struct drm_property_enum), GFP_KERNEL);
+	if (!prop_enum)
+		return -ENOMEM;
+
+	strncpy(prop_enum->name, name, DRM_PROP_NAME_LEN);
+	prop_enum->name[DRM_PROP_NAME_LEN-1] = '\0';
+	prop_enum->value = value;
+
+	property->values[index] = value;
+	list_add_tail(&prop_enum->head, &property->enum_blob_list);
+	return 0;
+}
+EXPORT_SYMBOL(drm_property_add_enum);
+
+void drm_property_destroy(struct drm_device *dev, struct drm_property *property)
+{
+	struct drm_property_enum *prop_enum, *pt;
+
+	list_for_each_entry_safe(prop_enum, pt, &property->enum_blob_list, head) {
+		list_del(&prop_enum->head);
+		kfree(prop_enum);
+	}
+
+	if (property->num_values)
+		kfree(property->values);
+	drm_mode_object_put(dev, &property->base);
+	list_del(&property->head);
+	kfree(property);
+}
+EXPORT_SYMBOL(drm_property_destroy);
+
+int drm_connector_attach_property(struct drm_connector *connector,
+			       struct drm_property *property, uint64_t init_val)
+{
+	int i;
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_PROPERTY; i++) {
+		if (connector->property_ids[i] == 0) {
+			connector->property_ids[i] = property->base.id;
+			connector->property_values[i] = init_val;
+			break;
+		}
+	}
+
+	if (i == DRM_CONNECTOR_MAX_PROPERTY)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(drm_connector_attach_property);
+
+int drm_connector_property_set_value(struct drm_connector *connector,
+				  struct drm_property *property, uint64_t value)
+{
+	int i;
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_PROPERTY; i++) {
+		if (connector->property_ids[i] == property->base.id) {
+			connector->property_values[i] = value;
+			break;
+		}
+	}
+
+	if (i == DRM_CONNECTOR_MAX_PROPERTY)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(drm_connector_property_set_value);
+
+int drm_connector_property_get_value(struct drm_connector *connector,
+				  struct drm_property *property, uint64_t *val)
+{
+	int i;
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_PROPERTY; i++) {
+		if (connector->property_ids[i] == property->base.id) {
+			*val = connector->property_values[i];
+			break;
+		}
+	}
+
+	if (i == DRM_CONNECTOR_MAX_PROPERTY)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(drm_connector_property_get_value);
+
+int drm_mode_getproperty_ioctl(struct drm_device *dev,
+			       void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_object *obj;
+	struct drm_mode_get_property *out_resp = data;
+	struct drm_property *property;
+	int enum_count = 0;
+	int blob_count = 0;
+	int value_count = 0;
+	int ret = 0, i;
+	int copied;
+	struct drm_property_enum *prop_enum;
+	struct drm_mode_property_enum __user *enum_ptr;
+	struct drm_property_blob *prop_blob;
+	uint32_t *blob_id_ptr;
+	uint64_t __user *values_ptr;
+	uint32_t __user *blob_length_ptr;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, out_resp->prop_id, DRM_MODE_OBJECT_PROPERTY);
+	if (!obj) {
+		ret = -EINVAL;
+		goto done;
+	}
+	property = obj_to_property(obj);
+
+	if (property->flags & DRM_MODE_PROP_ENUM) {
+		list_for_each_entry(prop_enum, &property->enum_blob_list, head)
+			enum_count++;
+	} else if (property->flags & DRM_MODE_PROP_BLOB) {
+		list_for_each_entry(prop_blob, &property->enum_blob_list, head)
+			blob_count++;
+	}
+
+	value_count = property->num_values;
+
+	strncpy(out_resp->name, property->name, DRM_PROP_NAME_LEN);
+	out_resp->name[DRM_PROP_NAME_LEN-1] = 0;
+	out_resp->flags = property->flags;
+
+	if ((out_resp->count_values >= value_count) && value_count) {
+		values_ptr = (uint64_t *)(unsigned long)out_resp->values_ptr;
+		for (i = 0; i < value_count; i++) {
+			if (copy_to_user(values_ptr + i, &property->values[i], sizeof(uint64_t))) {
+				ret = -EFAULT;
+				goto done;
+			}
+		}
+	}
+	out_resp->count_values = value_count;
+
+	if (property->flags & DRM_MODE_PROP_ENUM) {
+		if ((out_resp->count_enum_blobs >= enum_count) && enum_count) {
+			copied = 0;
+			enum_ptr = (struct drm_mode_property_enum *)(unsigned long)out_resp->enum_blob_ptr;
+			list_for_each_entry(prop_enum, &property->enum_blob_list, head) {
+
+				if (copy_to_user(&enum_ptr[copied].value, &prop_enum->value, sizeof(uint64_t))) {
+					ret = -EFAULT;
+					goto done;
+				}
+
+				if (copy_to_user(&enum_ptr[copied].name,
+						 &prop_enum->name, DRM_PROP_NAME_LEN)) {
+					ret = -EFAULT;
+					goto done;
+				}
+				copied++;
+			}
+		}
+		out_resp->count_enum_blobs = enum_count;
+	}
+
+	if (property->flags & DRM_MODE_PROP_BLOB) {
+		if ((out_resp->count_enum_blobs >= blob_count) && blob_count) {
+			copied = 0;
+			blob_id_ptr = (uint32_t *)(unsigned long)out_resp->enum_blob_ptr;
+			blob_length_ptr = (uint32_t *)(unsigned long)out_resp->values_ptr;
+
+			list_for_each_entry(prop_blob, &property->enum_blob_list, head) {
+				if (put_user(prop_blob->base.id, blob_id_ptr + copied)) {
+					ret = -EFAULT;
+					goto done;
+				}
+
+				if (put_user(prop_blob->length, blob_length_ptr + copied)) {
+					ret = -EFAULT;
+					goto done;
+				}
+
+				copied++;
+			}
+		}
+		out_resp->count_enum_blobs = blob_count;
+	}
+done:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+static struct drm_property_blob *drm_property_create_blob(struct drm_device *dev, int length,
+							  void *data)
+{
+	struct drm_property_blob *blob;
+
+	if (!length || !data)
+		return NULL;
+
+	blob = kzalloc(sizeof(struct drm_property_blob)+length, GFP_KERNEL);
+	if (!blob)
+		return NULL;
+
+	blob->data = (void *)((char *)blob + sizeof(struct drm_property_blob));
+	blob->length = length;
+
+	memcpy(blob->data, data, length);
+
+	drm_mode_object_get(dev, &blob->base, DRM_MODE_OBJECT_BLOB);
+
+	list_add_tail(&blob->head, &dev->mode_config.property_blob_list);
+	return blob;
+}
+
+static void drm_property_destroy_blob(struct drm_device *dev,
+			       struct drm_property_blob *blob)
+{
+	drm_mode_object_put(dev, &blob->base);
+	list_del(&blob->head);
+	kfree(blob);
+}
+
+int drm_mode_getblob_ioctl(struct drm_device *dev,
+			   void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_object *obj;
+	struct drm_mode_get_blob *out_resp = data;
+	struct drm_property_blob *blob;
+	int ret = 0;
+	void *blob_ptr;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, out_resp->blob_id, DRM_MODE_OBJECT_BLOB);
+	if (!obj) {
+		ret = -EINVAL;
+		goto done;
+	}
+	blob = obj_to_blob(obj);
+
+	if (out_resp->length == blob->length) {
+		blob_ptr = (void *)(unsigned long)out_resp->data;
+		if (copy_to_user(blob_ptr, blob->data, blob->length)){
+			ret = -EFAULT;
+			goto done;
+		}
+	}
+	out_resp->length = blob->length;
+
+done:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+int drm_mode_connector_update_edid_property(struct drm_connector *connector,
+					    struct edid *edid)
+{
+	struct drm_device *dev = connector->dev;
+	int ret = 0;
+
+	if (connector->edid_blob_ptr)
+		drm_property_destroy_blob(dev, connector->edid_blob_ptr);
+
+	/* Delete edid, when there is none. */
+	if (!edid) {
+		connector->edid_blob_ptr = NULL;
+		ret = drm_connector_property_set_value(connector, dev->mode_config.edid_property, 0);
+		return ret;
+	}
+
+	connector->edid_blob_ptr = drm_property_create_blob(connector->dev, 128, edid);
+
+	ret = drm_connector_property_set_value(connector,
+					       dev->mode_config.edid_property,
+					       connector->edid_blob_ptr->base.id);
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_mode_connector_update_edid_property);
+
+int drm_mode_connector_property_set_ioctl(struct drm_device *dev,
+				       void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_connector_set_property *out_resp = data;
+	struct drm_mode_object *obj;
+	struct drm_property *property;
+	struct drm_connector *connector;
+	int ret = -EINVAL;
+	int i;
+
+	mutex_lock(&dev->mode_config.mutex);
+
+	obj = drm_mode_object_find(dev, out_resp->connector_id, DRM_MODE_OBJECT_CONNECTOR);
+	if (!obj) {
+		goto out;
+	}
+	connector = obj_to_connector(obj);
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_PROPERTY; i++) {
+		if (connector->property_ids[i] == out_resp->prop_id)
+			break;
+	}
+
+	if (i == DRM_CONNECTOR_MAX_PROPERTY) {
+		goto out;
+	}
+
+	obj = drm_mode_object_find(dev, out_resp->prop_id, DRM_MODE_OBJECT_PROPERTY);
+	if (!obj) {
+		goto out;
+	}
+	property = obj_to_property(obj);
+
+	if (property->flags & DRM_MODE_PROP_IMMUTABLE)
+		goto out;
+
+	if (property->flags & DRM_MODE_PROP_RANGE) {
+		if (out_resp->value < property->values[0])
+			goto out;
+
+		if (out_resp->value > property->values[1])
+			goto out;
+	} else {
+		int found = 0;
+		for (i = 0; i < property->num_values; i++) {
+			if (property->values[i] == out_resp->value) {
+				found = 1;
+				break;
+			}
+		}
+		if (!found) {
+			goto out;
+		}
+	}
+
+	if (connector->funcs->set_property)
+		ret = connector->funcs->set_property(connector, property, out_resp->value);
+
+	/* store the property value if succesful */
+	if (!ret)
+		drm_connector_property_set_value(connector, property, out_resp->value);
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
+
+
+int drm_mode_replacefb(struct drm_device *dev,
+		       void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_fb_cmd *r = data;
+	struct drm_mode_object *obj;
+	struct drm_framebuffer *fb;
+	int found = 0;
+	struct drm_framebuffer *fbl = NULL;
+	int ret = 0;
+
+	/* right replace the current bo attached to this fb with a new bo */
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, r->buffer_id, DRM_MODE_OBJECT_FB);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	fb = obj_to_fb(obj);
+
+	list_for_each_entry(fbl, &file_priv->fbs, filp_head)
+		if (fb == fbl)
+			found = 1;
+
+	if (!found) {
+		DRM_ERROR("tried to replace an fb we didn't own\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (dev->mode_config.funcs->resize_fb)
+		ret = dev->mode_config.funcs->resize_fb(dev, file_priv, fb, r);
+	else
+		ret = -EINVAL;
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+
+}
+
+int drm_mode_connector_attach_encoder(struct drm_connector *connector,
+				      struct drm_encoder *encoder)
+{
+	int i;
+
+	for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++) {
+		if (connector->encoder_ids[i] == 0) {
+			connector->encoder_ids[i] = encoder->base.id;
+			return 0;
+		}
+	}
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(drm_mode_connector_attach_encoder);
+
+void drm_mode_connector_detach_encoder(struct drm_connector *connector,
+				    struct drm_encoder *encoder)
+{
+	int i;
+	for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++) {
+		if (connector->encoder_ids[i] == encoder->base.id) {
+			connector->encoder_ids[i] = 0;
+			if (connector->encoder == encoder)
+				connector->encoder = NULL;
+			break;
+		}
+	}
+}
+EXPORT_SYMBOL(drm_mode_connector_detach_encoder);
+
+bool drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc,
+				  int gamma_size)
+{
+	crtc->gamma_size = gamma_size;
+
+	crtc->gamma_store = kzalloc(gamma_size * sizeof(uint16_t) * 3, GFP_KERNEL);
+	if (!crtc->gamma_store) {
+		crtc->gamma_size = 0;
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(drm_mode_crtc_set_gamma_size);
+
+int drm_mode_gamma_set_ioctl(struct drm_device *dev,
+			     void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_crtc_lut *crtc_lut = data;
+	struct drm_mode_object *obj;
+	struct drm_crtc *crtc;
+	void *r_base, *g_base, *b_base;
+	int size;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, crtc_lut->crtc_id, DRM_MODE_OBJECT_CRTC);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	crtc = obj_to_crtc(obj);
+
+	/* memcpy into gamma store */
+	if (crtc_lut->gamma_size != crtc->gamma_size) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	size = crtc_lut->gamma_size * (sizeof(uint16_t));
+	r_base = crtc->gamma_store;
+	if (copy_from_user(r_base, (void __user *)(unsigned long)crtc_lut->red, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	g_base = r_base + size;
+	if (copy_from_user(g_base, (void __user *)(unsigned long)crtc_lut->green, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	b_base = g_base + size;
+	if (copy_from_user(b_base, (void __user *)(unsigned long)crtc_lut->blue, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	crtc->funcs->gamma_set(crtc, r_base, g_base, b_base, crtc->gamma_size);
+
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+
+}
+
+int drm_mode_gamma_get_ioctl(struct drm_device *dev,
+			     void *data, struct drm_file *file_priv)
+{
+	struct drm_mode_crtc_lut *crtc_lut = data;
+	struct drm_mode_object *obj;
+	struct drm_crtc *crtc;
+	void *r_base, *g_base, *b_base;
+	int size;
+	int ret = 0;
+
+	mutex_lock(&dev->mode_config.mutex);
+	obj = drm_mode_object_find(dev, crtc_lut->crtc_id, DRM_MODE_OBJECT_CRTC);
+	if (!obj) {
+		ret = -EINVAL;
+		goto out;
+	}
+	crtc = obj_to_crtc(obj);
+
+	/* memcpy into gamma store */
+	if (crtc_lut->gamma_size != crtc->gamma_size) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	size = crtc_lut->gamma_size * (sizeof(uint16_t));
+	r_base = crtc->gamma_store;
+	if (copy_to_user((void __user *)(unsigned long)crtc_lut->red, r_base, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	g_base = r_base + size;
+	if (copy_to_user((void __user *)(unsigned long)crtc_lut->green, g_base, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	b_base = g_base + size;
+	if (copy_to_user((void __user *)(unsigned long)crtc_lut->blue, b_base, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+out:
+	mutex_unlock(&dev->mode_config.mutex);
+	return ret;
+}
diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c
new file mode 100644
index 00000000000..887ed33b069
--- /dev/null
+++ b/drivers/gpu/drm/drm_crtc_helper.c
@@ -0,0 +1,822 @@
+/*
+ * Copyright (c) 2006-2008 Intel Corporation
+ * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
+ *
+ * DRM core CRTC related functions
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Authors:
+ *      Keith Packard
+ *	Eric Anholt <eric@anholt.net>
+ *      Dave Airlie <airlied@linux.ie>
+ *      Jesse Barnes <jesse.barnes@intel.com>
+ */
+
+#include "drmP.h"
+#include "drm_crtc.h"
+#include "drm_crtc_helper.h"
+
+/*
+ * Detailed mode info for a standard 640x480@60Hz monitor
+ */
+static struct drm_display_mode std_mode[] = {
+	{ DRM_MODE("640x480", DRM_MODE_TYPE_DEFAULT, 25200, 640, 656,
+		   752, 800, 0, 480, 490, 492, 525, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) },
+};
+
+/**
+ * drm_helper_probe_connector_modes - get complete set of display modes
+ * @dev: DRM device
+ * @maxX: max width for modes
+ * @maxY: max height for modes
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Based on @dev's mode_config layout, scan all the connectors and try to detect
+ * modes on them.  Modes will first be added to the connector's probed_modes
+ * list, then culled (based on validity and the @maxX, @maxY parameters) and
+ * put into the normal modes list.
+ *
+ * Intended to be used either at bootup time or when major configuration
+ * changes have occurred.
+ *
+ * FIXME: take into account monitor limits
+ */
+void drm_helper_probe_single_connector_modes(struct drm_connector *connector,
+					     uint32_t maxX, uint32_t maxY)
+{
+	struct drm_device *dev = connector->dev;
+	struct drm_display_mode *mode, *t;
+	struct drm_connector_helper_funcs *connector_funcs =
+		connector->helper_private;
+	int ret;
+
+	DRM_DEBUG("%s\n", drm_get_connector_name(connector));
+	/* set all modes to the unverified state */
+	list_for_each_entry_safe(mode, t, &connector->modes, head)
+		mode->status = MODE_UNVERIFIED;
+
+	connector->status = connector->funcs->detect(connector);
+
+	if (connector->status == connector_status_disconnected) {
+		DRM_DEBUG("%s is disconnected\n",
+			  drm_get_connector_name(connector));
+		/* TODO set EDID to NULL */
+		return;
+	}
+
+	ret = (*connector_funcs->get_modes)(connector);
+
+	if (ret) {
+		drm_mode_connector_list_update(connector);
+	}
+
+	if (maxX && maxY)
+		drm_mode_validate_size(dev, &connector->modes, maxX,
+				       maxY, 0);
+	list_for_each_entry_safe(mode, t, &connector->modes, head) {
+		if (mode->status == MODE_OK)
+			mode->status = connector_funcs->mode_valid(connector,
+								   mode);
+	}
+
+
+	drm_mode_prune_invalid(dev, &connector->modes, true);
+
+	if (list_empty(&connector->modes)) {
+		struct drm_display_mode *stdmode;
+
+		DRM_DEBUG("No valid modes on %s\n",
+			  drm_get_connector_name(connector));
+
+		/* Should we do this here ???
+		 * When no valid EDID modes are available we end up
+		 * here and bailed in the past, now we add a standard
+		 * 640x480@60Hz mode and carry on.
+		 */
+		stdmode = drm_mode_duplicate(dev, &std_mode[0]);
+		drm_mode_probed_add(connector, stdmode);
+		drm_mode_list_concat(&connector->probed_modes,
+				     &connector->modes);
+
+		DRM_DEBUG("Adding standard 640x480 @ 60Hz to %s\n",
+			  drm_get_connector_name(connector));
+	}
+
+	drm_mode_sort(&connector->modes);
+
+	DRM_DEBUG("Probed modes for %s\n", drm_get_connector_name(connector));
+	list_for_each_entry_safe(mode, t, &connector->modes, head) {
+		mode->vrefresh = drm_mode_vrefresh(mode);
+
+		drm_mode_set_crtcinfo(mode, CRTC_INTERLACE_HALVE_V);
+		drm_mode_debug_printmodeline(mode);
+	}
+}
+EXPORT_SYMBOL(drm_helper_probe_single_connector_modes);
+
+void drm_helper_probe_connector_modes(struct drm_device *dev, uint32_t maxX,
+				      uint32_t maxY)
+{
+	struct drm_connector *connector;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		drm_helper_probe_single_connector_modes(connector, maxX, maxY);
+	}
+}
+EXPORT_SYMBOL(drm_helper_probe_connector_modes);
+
+
+/**
+ * drm_helper_crtc_in_use - check if a given CRTC is in a mode_config
+ * @crtc: CRTC to check
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Walk @crtc's DRM device's mode_config and see if it's in use.
+ *
+ * RETURNS:
+ * True if @crtc is part of the mode_config, false otherwise.
+ */
+bool drm_helper_crtc_in_use(struct drm_crtc *crtc)
+{
+	struct drm_encoder *encoder;
+	struct drm_device *dev = crtc->dev;
+	/* FIXME: Locking around list access? */
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head)
+		if (encoder->crtc == crtc)
+			return true;
+	return false;
+}
+EXPORT_SYMBOL(drm_helper_crtc_in_use);
+
+/**
+ * drm_disable_unused_functions - disable unused objects
+ * @dev: DRM device
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * If an connector or CRTC isn't part of @dev's mode_config, it can be disabled
+ * by calling its dpms function, which should power it off.
+ */
+void drm_helper_disable_unused_functions(struct drm_device *dev)
+{
+	struct drm_encoder *encoder;
+	struct drm_encoder_helper_funcs *encoder_funcs;
+	struct drm_crtc *crtc;
+
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+		encoder_funcs = encoder->helper_private;
+		if (!encoder->crtc)
+			(*encoder_funcs->dpms)(encoder, DRM_MODE_DPMS_OFF);
+	}
+
+	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
+		struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private;
+		crtc->enabled = drm_helper_crtc_in_use(crtc);
+		if (!crtc->enabled) {
+			crtc_funcs->dpms(crtc, DRM_MODE_DPMS_OFF);
+			crtc->fb = NULL;
+		}
+	}
+}
+EXPORT_SYMBOL(drm_helper_disable_unused_functions);
+
+static struct drm_display_mode *drm_has_preferred_mode(struct drm_connector *connector, int width, int height)
+{
+	struct drm_display_mode *mode;
+
+	list_for_each_entry(mode, &connector->modes, head) {
+		if (drm_mode_width(mode) > width ||
+		    drm_mode_height(mode) > height)
+			continue;
+		if (mode->type & DRM_MODE_TYPE_PREFERRED)
+			return mode;
+	}
+	return NULL;
+}
+
+static bool drm_connector_enabled(struct drm_connector *connector, bool strict)
+{
+	bool enable;
+
+	if (strict) {
+		enable = connector->status == connector_status_connected;
+	} else {
+		enable = connector->status != connector_status_disconnected;
+	}
+	return enable;
+}
+
+static void drm_enable_connectors(struct drm_device *dev, bool *enabled)
+{
+	bool any_enabled = false;
+	struct drm_connector *connector;
+	int i = 0;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		enabled[i] = drm_connector_enabled(connector, true);
+		any_enabled |= enabled[i];
+		i++;
+	}
+
+	if (any_enabled)
+		return;
+
+	i = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		enabled[i] = drm_connector_enabled(connector, false);
+		i++;
+	}
+}
+
+static bool drm_target_preferred(struct drm_device *dev,
+				 struct drm_display_mode **modes,
+				 bool *enabled, int width, int height)
+{
+	struct drm_connector *connector;
+	int i = 0;
+
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+
+		if (enabled[i] == false) {
+			i++;
+			continue;
+		}
+
+		modes[i] = drm_has_preferred_mode(connector, width, height);
+		if (!modes[i]) {
+			list_for_each_entry(modes[i], &connector->modes, head)
+				break;
+		}
+		i++;
+	}
+	return true;
+}
+
+static int drm_pick_crtcs(struct drm_device *dev,
+			  struct drm_crtc **best_crtcs,
+			  struct drm_display_mode **modes,
+			  int n, int width, int height)
+{
+	int c, o;
+	struct drm_connector *connector;
+	struct drm_connector_helper_funcs *connector_funcs;
+	struct drm_encoder *encoder;
+	struct drm_crtc *best_crtc;
+	int my_score, best_score, score;
+	struct drm_crtc **crtcs, *crtc;
+
+	if (n == dev->mode_config.num_connector)
+		return 0;
+	c = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		if (c == n)
+			break;
+		c++;
+	}
+
+	best_crtcs[n] = NULL;
+	best_crtc = NULL;
+	best_score = drm_pick_crtcs(dev, best_crtcs, modes, n+1, width, height);
+	if (modes[n] == NULL)
+		return best_score;
+
+	crtcs = kmalloc(dev->mode_config.num_connector *
+			sizeof(struct drm_crtc *), GFP_KERNEL);
+	if (!crtcs)
+		return best_score;
+
+	my_score = 1;
+	if (connector->status == connector_status_connected)
+		my_score++;
+	if (drm_has_preferred_mode(connector, width, height))
+		my_score++;
+
+	connector_funcs = connector->helper_private;
+	encoder = connector_funcs->best_encoder(connector);
+	if (!encoder)
+		goto out;
+
+	connector->encoder = encoder;
+
+	/* select a crtc for this connector and then attempt to configure
+	   remaining connectors */
+	c = 0;
+	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
+
+		if ((connector->encoder->possible_crtcs & (1 << c)) == 0) {
+			c++;
+			continue;
+		}
+
+		for (o = 0; o < n; o++)
+			if (best_crtcs[o] == crtc)
+				break;
+
+		if (o < n) {
+			/* ignore cloning for now */
+			c++;
+			continue;
+		}
+
+		crtcs[n] = crtc;
+		memcpy(crtcs, best_crtcs, n * sizeof(struct drm_crtc *));
+		score = my_score + drm_pick_crtcs(dev, crtcs, modes, n + 1,
+						  width, height);
+		if (score > best_score) {
+			best_crtc = crtc;
+			best_score = score;
+			memcpy(best_crtcs, crtcs,
+			       dev->mode_config.num_connector *
+			       sizeof(struct drm_crtc *));
+		}
+		c++;
+	}
+out:
+	kfree(crtcs);
+	return best_score;
+}
+
+static void drm_setup_crtcs(struct drm_device *dev)
+{
+	struct drm_crtc **crtcs;
+	struct drm_display_mode **modes;
+	struct drm_encoder *encoder;
+	struct drm_connector *connector;
+	bool *enabled;
+	int width, height;
+	int i, ret;
+
+	width = dev->mode_config.max_width;
+	height = dev->mode_config.max_height;
+
+	/* clean out all the encoder/crtc combos */
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+		encoder->crtc = NULL;
+	}
+
+	crtcs = kcalloc(dev->mode_config.num_connector,
+			sizeof(struct drm_crtc *), GFP_KERNEL);
+	modes = kcalloc(dev->mode_config.num_connector,
+			sizeof(struct drm_display_mode *), GFP_KERNEL);
+	enabled = kcalloc(dev->mode_config.num_connector,
+			  sizeof(bool), GFP_KERNEL);
+
+	drm_enable_connectors(dev, enabled);
+
+	ret = drm_target_preferred(dev, modes, enabled, width, height);
+	if (!ret)
+		DRM_ERROR("Unable to find initial modes\n");
+
+	drm_pick_crtcs(dev, crtcs, modes, 0, width, height);
+
+	i = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		struct drm_display_mode *mode = modes[i];
+		struct drm_crtc *crtc = crtcs[i];
+
+		if (connector->encoder == NULL) {
+			i++;
+			continue;
+		}
+
+		if (mode && crtc) {
+			crtc->desired_mode = mode;
+			connector->encoder->crtc = crtc;
+		} else
+			connector->encoder->crtc = NULL;
+		i++;
+	}
+
+	kfree(crtcs);
+	kfree(modes);
+	kfree(enabled);
+}
+/**
+ * drm_crtc_set_mode - set a mode
+ * @crtc: CRTC to program
+ * @mode: mode to use
+ * @x: width of mode
+ * @y: height of mode
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Try to set @mode on @crtc.  Give @crtc and its associated connectors a chance
+ * to fixup or reject the mode prior to trying to set it.
+ *
+ * RETURNS:
+ * True if the mode was set successfully, or false otherwise.
+ */
+bool drm_crtc_helper_set_mode(struct drm_crtc *crtc,
+			      struct drm_display_mode *mode,
+			      int x, int y)
+{
+	struct drm_device *dev = crtc->dev;
+	struct drm_display_mode *adjusted_mode, saved_mode;
+	struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private;
+	struct drm_encoder_helper_funcs *encoder_funcs;
+	int saved_x, saved_y;
+	struct drm_encoder *encoder;
+	bool ret = true;
+
+	adjusted_mode = drm_mode_duplicate(dev, mode);
+
+	crtc->enabled = drm_helper_crtc_in_use(crtc);
+
+	if (!crtc->enabled)
+		return true;
+
+	saved_mode = crtc->mode;
+	saved_x = crtc->x;
+	saved_y = crtc->y;
+
+	/* Update crtc values up front so the driver can rely on them for mode
+	 * setting.
+	 */
+	crtc->mode = *mode;
+	crtc->x = x;
+	crtc->y = y;
+
+	if (drm_mode_equal(&saved_mode, &crtc->mode)) {
+		if (saved_x != crtc->x || saved_y != crtc->y) {
+			crtc_funcs->mode_set_base(crtc, crtc->x, crtc->y);
+			goto done;
+		}
+	}
+
+	/* Pass our mode to the connectors and the CRTC to give them a chance to
+	 * adjust it according to limitations or connector properties, and also
+	 * a chance to reject the mode entirely.
+	 */
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+
+		if (encoder->crtc != crtc)
+			continue;
+		encoder_funcs = encoder->helper_private;
+		if (!(ret = encoder_funcs->mode_fixup(encoder, mode,
+						      adjusted_mode))) {
+			goto done;
+		}
+	}
+
+	if (!(ret = crtc_funcs->mode_fixup(crtc, mode, adjusted_mode))) {
+		goto done;
+	}
+
+	/* Prepare the encoders and CRTCs before setting the mode. */
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+
+		if (encoder->crtc != crtc)
+			continue;
+		encoder_funcs = encoder->helper_private;
+		/* Disable the encoders as the first thing we do. */
+		encoder_funcs->prepare(encoder);
+	}
+
+	crtc_funcs->prepare(crtc);
+
+	/* Set up the DPLL and any encoders state that needs to adjust or depend
+	 * on the DPLL.
+	 */
+	crtc_funcs->mode_set(crtc, mode, adjusted_mode, x, y);
+
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+
+		if (encoder->crtc != crtc)
+			continue;
+
+		DRM_INFO("%s: set mode %s %x\n", drm_get_encoder_name(encoder),
+			 mode->name, mode->base.id);
+		encoder_funcs = encoder->helper_private;
+		encoder_funcs->mode_set(encoder, mode, adjusted_mode);
+	}
+
+	/* Now enable the clocks, plane, pipe, and connectors that we set up. */
+	crtc_funcs->commit(crtc);
+
+	list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+
+		if (encoder->crtc != crtc)
+			continue;
+
+		encoder_funcs = encoder->helper_private;
+		encoder_funcs->commit(encoder);
+
+	}
+
+	/* XXX free adjustedmode */
+	drm_mode_destroy(dev, adjusted_mode);
+	/* FIXME: add subpixel order */
+done:
+	if (!ret) {
+		crtc->mode = saved_mode;
+		crtc->x = saved_x;
+		crtc->y = saved_y;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(drm_crtc_helper_set_mode);
+
+
+/**
+ * drm_crtc_helper_set_config - set a new config from userspace
+ * @crtc: CRTC to setup
+ * @crtc_info: user provided configuration
+ * @new_mode: new mode to set
+ * @connector_set: set of connectors for the new config
+ * @fb: new framebuffer
+ *
+ * LOCKING:
+ * Caller must hold mode config lock.
+ *
+ * Setup a new configuration, provided by the user in @crtc_info, and enable
+ * it.
+ *
+ * RETURNS:
+ * Zero. (FIXME)
+ */
+int drm_crtc_helper_set_config(struct drm_mode_set *set)
+{
+	struct drm_device *dev;
+	struct drm_crtc **save_crtcs, *new_crtc;
+	struct drm_encoder **save_encoders, *new_encoder;
+	bool save_enabled;
+	bool changed = false;
+	bool flip_or_move = false;
+	struct drm_connector *connector;
+	int count = 0, ro, fail = 0;
+	struct drm_crtc_helper_funcs *crtc_funcs;
+	int ret = 0;
+
+	DRM_DEBUG("\n");
+
+	if (!set)
+		return -EINVAL;
+
+	if (!set->crtc)
+		return -EINVAL;
+
+	if (!set->crtc->helper_private)
+		return -EINVAL;
+
+	crtc_funcs = set->crtc->helper_private;
+
+	DRM_DEBUG("crtc: %p %d fb: %p connectors: %p num_connectors: %d (x, y) (%i, %i)\n",
+		  set->crtc, set->crtc->base.id, set->fb, set->connectors,
+		  (int)set->num_connectors, set->x, set->y);
+
+	dev = set->crtc->dev;
+
+	/* save previous config */
+	save_enabled = set->crtc->enabled;
+
+	/* this is meant to be num_connector not num_crtc */
+	save_crtcs = kzalloc(dev->mode_config.num_connector *
+			     sizeof(struct drm_crtc *), GFP_KERNEL);
+	if (!save_crtcs)
+		return -ENOMEM;
+
+	save_encoders = kzalloc(dev->mode_config.num_connector *
+				sizeof(struct drm_encoders *), GFP_KERNEL);
+	if (!save_encoders) {
+		kfree(save_crtcs);
+		return -ENOMEM;
+	}
+
+	/* We should be able to check here if the fb has the same properties
+	 * and then just flip_or_move it */
+	if (set->crtc->fb != set->fb) {
+		/* if we have no fb then its a change not a flip */
+		if (set->crtc->fb == NULL)
+			changed = true;
+		else
+			flip_or_move = true;
+	}
+
+	if (set->x != set->crtc->x || set->y != set->crtc->y)
+		flip_or_move = true;
+
+	if (set->mode && !drm_mode_equal(set->mode, &set->crtc->mode)) {
+		DRM_DEBUG("modes are different\n");
+		drm_mode_debug_printmodeline(&set->crtc->mode);
+		drm_mode_debug_printmodeline(set->mode);
+		changed = true;
+	}
+
+	/* a) traverse passed in connector list and get encoders for them */
+	count = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		struct drm_connector_helper_funcs *connector_funcs =
+			connector->helper_private;
+		save_encoders[count++] = connector->encoder;
+		new_encoder = connector->encoder;
+		for (ro = 0; ro < set->num_connectors; ro++) {
+			if (set->connectors[ro] == connector) {
+				new_encoder = connector_funcs->best_encoder(connector);
+				/* if we can't get an encoder for a connector
+				   we are setting now - then fail */
+				if (new_encoder == NULL)
+					/* don't break so fail path works correct */
+					fail = 1;
+				break;
+			}
+		}
+
+		if (new_encoder != connector->encoder) {
+			changed = true;
+			connector->encoder = new_encoder;
+		}
+	}
+
+	if (fail) {
+		ret = -EINVAL;
+		goto fail_no_encoder;
+	}
+
+	count = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		if (!connector->encoder)
+			continue;
+
+		save_crtcs[count++] = connector->encoder->crtc;
+
+		if (connector->encoder->crtc == set->crtc)
+			new_crtc = NULL;
+		else
+			new_crtc = connector->encoder->crtc;
+
+		for (ro = 0; ro < set->num_connectors; ro++) {
+			if (set->connectors[ro] == connector)
+				new_crtc = set->crtc;
+		}
+		if (new_crtc != connector->encoder->crtc) {
+			changed = true;
+			connector->encoder->crtc = new_crtc;
+		}
+	}
+
+	/* mode_set_base is not a required function */
+	if (flip_or_move && !crtc_funcs->mode_set_base)
+		changed = true;
+
+	if (changed) {
+		set->crtc->fb = set->fb;
+		set->crtc->enabled = (set->mode != NULL);
+		if (set->mode != NULL) {
+			DRM_DEBUG("attempting to set mode from userspace\n");
+			drm_mode_debug_printmodeline(set->mode);
+			if (!drm_crtc_helper_set_mode(set->crtc, set->mode,
+						      set->x, set->y)) {
+				ret = -EINVAL;
+				goto fail_set_mode;
+			}
+			/* TODO are these needed? */
+			set->crtc->desired_x = set->x;
+			set->crtc->desired_y = set->y;
+			set->crtc->desired_mode = set->mode;
+		}
+		drm_helper_disable_unused_functions(dev);
+	} else if (flip_or_move) {
+		if (set->crtc->fb != set->fb)
+			set->crtc->fb = set->fb;
+		crtc_funcs->mode_set_base(set->crtc, set->x, set->y);
+	}
+
+	kfree(save_encoders);
+	kfree(save_crtcs);
+	return 0;
+
+fail_set_mode:
+	set->crtc->enabled = save_enabled;
+	count = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head)
+		connector->encoder->crtc = save_crtcs[count++];
+fail_no_encoder:
+	kfree(save_crtcs);
+	count = 0;
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
+		connector->encoder = save_encoders[count++];
+	}
+	kfree(save_encoders);
+	return ret;
+}
+EXPORT_SYMBOL(drm_crtc_helper_set_config);
+
+bool drm_helper_plugged_event(struct drm_device *dev)
+{
+	DRM_DEBUG("\n");
+
+	drm_helper_probe_connector_modes(dev, dev->mode_config.max_width,
+					 dev->mode_config.max_height);
+
+	drm_setup_crtcs(dev);
+
+	/* alert the driver fb layer */
+	dev->mode_config.funcs->fb_changed(dev);
+
+	/* FIXME: send hotplug event */
+	return true;
+}
+/**
+ * drm_initial_config - setup a sane initial connector configuration
+ * @dev: DRM device
+ * @can_grow: this configuration is growable
+ *
+ * LOCKING:
+ * Called at init time, must take mode config lock.
+ *
+ * Scan the CRTCs and connectors and try to put together an initial setup.
+ * At the moment, this is a cloned configuration across all heads with
+ * a new framebuffer object as the backing store.
+ *
+ * RETURNS:
+ * Zero if everything went ok, nonzero otherwise.
+ */
+bool drm_helper_initial_config(struct drm_device *dev, bool can_grow)
+{
+	int ret = false;
+
+	drm_helper_plugged_event(dev);
+	return ret;
+}
+EXPORT_SYMBOL(drm_helper_initial_config);
+
+/**
+ * drm_hotplug_stage_two
+ * @dev DRM device
+ * @connector hotpluged connector
+ *
+ * LOCKING.
+ * Caller must hold mode config lock, function might grab struct lock.
+ *
+ * Stage two of a hotplug.
+ *
+ * RETURNS:
+ * Zero on success, errno on failure.
+ */
+int drm_helper_hotplug_stage_two(struct drm_device *dev)
+{
+	dev->mode_config.hotplug_counter++;
+
+	drm_helper_plugged_event(dev);
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_helper_hotplug_stage_two);
+
+int drm_helper_mode_fill_fb_struct(struct drm_framebuffer *fb,
+				   struct drm_mode_fb_cmd *mode_cmd)
+{
+	fb->width = mode_cmd->width;
+	fb->height = mode_cmd->height;
+	fb->pitch = mode_cmd->pitch;
+	fb->bits_per_pixel = mode_cmd->bpp;
+	fb->depth = mode_cmd->depth;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_helper_mode_fill_fb_struct);
+
+int drm_helper_resume_force_mode(struct drm_device *dev)
+{
+	struct drm_crtc *crtc;
+	int ret;
+
+	list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
+
+		if (!crtc->enabled)
+			continue;
+
+		ret = drm_crtc_helper_set_mode(crtc, &crtc->mode, crtc->x,
+					       crtc->y);
+
+		if (ret == false)
+			DRM_ERROR("failed to set mode on crtc %p\n", crtc);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(drm_helper_resume_force_mode);
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 98a781375f6..0b9f3164a3b 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -126,6 +126,26 @@ static struct drm_ioctl_desc drm_ioctls[] = {
 	DRM_IOCTL_DEF(DRM_IOCTL_GEM_CLOSE, drm_gem_close_ioctl, 0),
 	DRM_IOCTL_DEF(DRM_IOCTL_GEM_FLINK, drm_gem_flink_ioctl, DRM_AUTH),
 	DRM_IOCTL_DEF(DRM_IOCTL_GEM_OPEN, drm_gem_open_ioctl, DRM_AUTH),
+
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETRESOURCES, drm_mode_getresources, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETCRTC, drm_mode_getcrtc, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETCONNECTOR, drm_mode_getconnector, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETCRTC, drm_mode_setcrtc, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_CURSOR, drm_mode_cursor_ioctl, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_ADDFB, drm_mode_addfb, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_RMFB, drm_mode_rmfb, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETFB, drm_mode_getfb, DRM_MASTER|DRM_CONTROL_ALLOW),
+
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETPROPERTY, drm_mode_connector_property_set_ioctl, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPROPBLOB, drm_mode_getblob_ioctl, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_ATTACHMODE, drm_mode_attachmode_ioctl, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_DETACHMODE, drm_mode_detachmode_ioctl, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETPROPERTY, drm_mode_getproperty_ioctl, DRM_MASTER | DRM_CONTROL_ALLOW),
+
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_REPLACEFB, drm_mode_replacefb, DRM_MASTER|DRM_ROOT_ONLY|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETENCODER, drm_mode_getencoder, DRM_MASTER|DRM_CONTROL_ALLOW),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_GETGAMMA, drm_mode_gamma_get_ioctl, DRM_MASTER),
+	DRM_IOCTL_DEF(DRM_IOCTL_MODE_SETGAMMA, drm_mode_gamma_set_ioctl, DRM_MASTER),
 };
 
 #define DRM_CORE_IOCTL_COUNT	ARRAY_SIZE( drm_ioctls )
@@ -150,7 +170,7 @@ int drm_lastclose(struct drm_device * dev)
 		dev->driver->lastclose(dev);
 	DRM_DEBUG("driver lastclose completed\n");
 
-	if (dev->irq_enabled)
+	if (dev->irq_enabled && !drm_core_check_feature(dev, DRIVER_MODESET))
 		drm_irq_uninstall(dev);
 
 	mutex_lock(&dev->struct_mutex);
@@ -160,7 +180,8 @@ int drm_lastclose(struct drm_device * dev)
 	del_timer(&dev->timer);
 
 	/* Clear AGP information */
-	if (drm_core_has_AGP(dev) && dev->agp) {
+	if (drm_core_has_AGP(dev) && dev->agp &&
+	    !drm_core_check_feature(dev, DRIVER_MODESET)) {
 		struct drm_agp_mem *entry, *tempe;
 
 		/* Remove AGP resources, but leave dev->agp
@@ -179,7 +200,8 @@ int drm_lastclose(struct drm_device * dev)
 		dev->agp->acquired = 0;
 		dev->agp->enabled = 0;
 	}
-	if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg) {
+	if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
+	    !drm_core_check_feature(dev, DRIVER_MODESET)) {
 		drm_sg_cleanup(dev->sg);
 		dev->sg = NULL;
 	}
@@ -206,7 +228,8 @@ int drm_lastclose(struct drm_device * dev)
 	}
 	dev->queue_count = 0;
 
-	if (drm_core_check_feature(dev, DRIVER_HAVE_DMA))
+	if (drm_core_check_feature(dev, DRIVER_HAVE_DMA) &&
+	    !drm_core_check_feature(dev, DRIVER_MODESET))
 		drm_dma_takedown(dev);
 
 	dev->dev_mapping = NULL;
@@ -307,6 +330,9 @@ static void drm_cleanup(struct drm_device * dev)
 	drm_ht_remove(&dev->map_hash);
 	drm_ctxbitmap_cleanup(dev);
 
+	if (drm_core_check_feature(dev, DRIVER_MODESET))
+		drm_put_minor(&dev->control);
+
 	if (driver->driver_features & DRIVER_GEM)
 		drm_gem_destroy(dev);
 
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
new file mode 100644
index 00000000000..681753e57dc
--- /dev/null
+++ b/drivers/gpu/drm/drm_edid.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2006 Luc Verhaegen (quirks list)
+ * Copyright (c) 2007-2008 Intel Corporation
+ *   Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * DDC probing routines (drm_ddc_read & drm_do_probe_ddc_edid) originally from
+ * FB layer.
+ *   Copyright (C) 2006 Dennis Munsie <dmunsie@cecropia.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sub license,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/i2c.h>
+#include <linux/i2c-algo-bit.h>
+#include "drmP.h"
+#include "drm_edid.h"
+
+/*
+ * TODO:
+ *   - support EDID 1.4 (incl. CE blocks)
+ */
+
+/*
+ * EDID blocks out in the wild have a variety of bugs, try to collect
+ * them here (note that userspace may work around broken monitors first,
+ * but fixes should make their way here so that the kernel "just works"
+ * on as many displays as possible).
+ */
+
+/* First detailed mode wrong, use largest 60Hz mode */
+#define EDID_QUIRK_PREFER_LARGE_60		(1 << 0)
+/* Reported 135MHz pixel clock is too high, needs adjustment */
+#define EDID_QUIRK_135_CLOCK_TOO_HIGH		(1 << 1)
+/* Prefer the largest mode at 75 Hz */
+#define EDID_QUIRK_PREFER_LARGE_75		(1 << 2)
+/* Detail timing is in cm not mm */
+#define EDID_QUIRK_DETAILED_IN_CM		(1 << 3)
+/* Detailed timing descriptors have bogus size values, so just take the
+ * maximum size and use that.
+ */
+#define EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE	(1 << 4)
+/* Monitor forgot to set the first detailed is preferred bit. */
+#define EDID_QUIRK_FIRST_DETAILED_PREFERRED	(1 << 5)
+/* use +hsync +vsync for detailed mode */
+#define EDID_QUIRK_DETAILED_SYNC_PP		(1 << 6)
+
+static struct edid_quirk {
+	char *vendor;
+	int product_id;
+	u32 quirks;
+} edid_quirk_list[] = {
+	/* Acer AL1706 */
+	{ "ACR", 44358, EDID_QUIRK_PREFER_LARGE_60 },
+	/* Acer F51 */
+	{ "API", 0x7602, EDID_QUIRK_PREFER_LARGE_60 },
+	/* Unknown Acer */
+	{ "ACR", 2423, EDID_QUIRK_FIRST_DETAILED_PREFERRED },
+
+	/* Belinea 10 15 55 */
+	{ "MAX", 1516, EDID_QUIRK_PREFER_LARGE_60 },
+	{ "MAX", 0x77e, EDID_QUIRK_PREFER_LARGE_60 },
+
+	/* Envision Peripherals, Inc. EN-7100e */
+	{ "EPI", 59264, EDID_QUIRK_135_CLOCK_TOO_HIGH },
+
+	/* Funai Electronics PM36B */
+	{ "FCM", 13600, EDID_QUIRK_PREFER_LARGE_75 |
+	  EDID_QUIRK_DETAILED_IN_CM },
+
+	/* LG Philips LCD LP154W01-A5 */
+	{ "LPL", 0, EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE },
+	{ "LPL", 0x2a00, EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE },
+
+	/* Philips 107p5 CRT */
+	{ "PHL", 57364, EDID_QUIRK_FIRST_DETAILED_PREFERRED },
+
+	/* Proview AY765C */
+	{ "PTS", 765, EDID_QUIRK_FIRST_DETAILED_PREFERRED },
+
+	/* Samsung SyncMaster 205BW.  Note: irony */
+	{ "SAM", 541, EDID_QUIRK_DETAILED_SYNC_PP },
+	/* Samsung SyncMaster 22[5-6]BW */
+	{ "SAM", 596, EDID_QUIRK_PREFER_LARGE_60 },
+	{ "SAM", 638, EDID_QUIRK_PREFER_LARGE_60 },
+};
+
+
+/* Valid EDID header has these bytes */
+static u8 edid_header[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 };
+
+/**
+ * edid_is_valid - sanity check EDID data
+ * @edid: EDID data
+ *
+ * Sanity check the EDID block by looking at the header, the version number
+ * and the checksum.  Return 0 if the EDID doesn't check out, or 1 if it's
+ * valid.
+ */
+static bool edid_is_valid(struct edid *edid)
+{
+	int i;
+	u8 csum = 0;
+	u8 *raw_edid = (u8 *)edid;
+
+	if (memcmp(edid->header, edid_header, sizeof(edid_header)))
+		goto bad;
+	if (edid->version != 1) {
+		DRM_ERROR("EDID has major version %d, instead of 1\n", edid->version);
+		goto bad;
+	}
+	if (edid->revision <= 0 || edid->revision > 3) {
+		DRM_ERROR("EDID has minor version %d, which is not between 0-3\n", edid->revision);
+		goto bad;
+	}
+
+	for (i = 0; i < EDID_LENGTH; i++)
+		csum += raw_edid[i];
+	if (csum) {
+		DRM_ERROR("EDID checksum is invalid, remainder is %d\n", csum);
+		goto bad;
+	}
+
+	return 1;
+
+bad:
+	if (raw_edid) {
+		DRM_ERROR("Raw EDID:\n");
+		print_hex_dump_bytes(KERN_ERR, DUMP_PREFIX_NONE, raw_edid, EDID_LENGTH);
+		printk("\n");
+	}
+	return 0;
+}
+
+/**
+ * edid_vendor - match a string against EDID's obfuscated vendor field
+ * @edid: EDID to match
+ * @vendor: vendor string
+ *
+ * Returns true if @vendor is in @edid, false otherwise
+ */
+static bool edid_vendor(struct edid *edid, char *vendor)
+{
+	char edid_vendor[3];
+
+	edid_vendor[0] = ((edid->mfg_id[0] & 0x7c) >> 2) + '@';
+	edid_vendor[1] = (((edid->mfg_id[0] & 0x3) << 3) |
+			  ((edid->mfg_id[1] & 0xe0) >> 5)) + '@';
+	edid_vendor[2] = (edid->mfg_id[2] & 0x1f) + '@';
+
+	return !strncmp(edid_vendor, vendor, 3);
+}
+
+/**
+ * edid_get_quirks - return quirk flags for a given EDID
+ * @edid: EDID to process
+ *
+ * This tells subsequent routines what fixes they need to apply.
+ */
+static u32 edid_get_quirks(struct edid *edid)
+{
+	struct edid_quirk *quirk;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(edid_quirk_list); i++) {
+		quirk = &edid_quirk_list[i];
+
+		if (edid_vendor(edid, quirk->vendor) &&
+		    (EDID_PRODUCT_ID(edid) == quirk->product_id))
+			return quirk->quirks;
+	}
+
+	return 0;
+}
+
+#define MODE_SIZE(m) ((m)->hdisplay * (m)->vdisplay)
+#define MODE_REFRESH_DIFF(m,r) (abs((m)->vrefresh - target_refresh))
+
+
+/**
+ * edid_fixup_preferred - set preferred modes based on quirk list
+ * @connector: has mode list to fix up
+ * @quirks: quirks list
+ *
+ * Walk the mode list for @connector, clearing the preferred status
+ * on existing modes and setting it anew for the right mode ala @quirks.
+ */
+static void edid_fixup_preferred(struct drm_connector *connector,
+				 u32 quirks)
+{
+	struct drm_display_mode *t, *cur_mode, *preferred_mode;
+	int target_refresh;
+
+	if (list_empty(&connector->probed_modes))
+		return;
+
+	if (quirks & EDID_QUIRK_PREFER_LARGE_60)
+		target_refresh = 60;
+	if (quirks & EDID_QUIRK_PREFER_LARGE_75)
+		target_refresh = 75;
+
+	preferred_mode = list_first_entry(&connector->probed_modes,
+					  struct drm_display_mode, head);
+
+	list_for_each_entry_safe(cur_mode, t, &connector->probed_modes, head) {
+		cur_mode->type &= ~DRM_MODE_TYPE_PREFERRED;
+
+		if (cur_mode == preferred_mode)
+			continue;
+
+		/* Largest mode is preferred */
+		if (MODE_SIZE(cur_mode) > MODE_SIZE(preferred_mode))
+			preferred_mode = cur_mode;
+
+		/* At a given size, try to get closest to target refresh */
+		if ((MODE_SIZE(cur_mode) == MODE_SIZE(preferred_mode)) &&
+		    MODE_REFRESH_DIFF(cur_mode, target_refresh) <
+		    MODE_REFRESH_DIFF(preferred_mode, target_refresh)) {
+			preferred_mode = cur_mode;
+		}
+	}
+
+	preferred_mode->type |= DRM_MODE_TYPE_PREFERRED;
+}
+
+/**
+ * drm_mode_std - convert standard mode info (width, height, refresh) into mode
+ * @t: standard timing params
+ *
+ * Take the standard timing params (in this case width, aspect, and refresh)
+ * and convert them into a real mode using CVT.
+ *
+ * Punts for now, but should eventually use the FB layer's CVT based mode
+ * generation code.
+ */
+struct drm_display_mode *drm_mode_std(struct drm_device *dev,
+				      struct std_timing *t)
+{
+	struct drm_display_mode *mode;
+	int hsize = t->hsize * 8 + 248, vsize;
+
+	mode = drm_mode_create(dev);
+	if (!mode)
+		return NULL;
+
+	if (t->aspect_ratio == 0)
+		vsize = (hsize * 10) / 16;
+	else if (t->aspect_ratio == 1)
+		vsize = (hsize * 3) / 4;
+	else if (t->aspect_ratio == 2)
+		vsize = (hsize * 4) / 5;
+	else
+		vsize = (hsize * 9) / 16;
+
+	drm_mode_set_name(mode);
+
+	return mode;
+}
+
+/**
+ * drm_mode_detailed - create a new mode from an EDID detailed timing section
+ * @dev: DRM device (needed to create new mode)
+ * @edid: EDID block
+ * @timing: EDID detailed timing info
+ * @quirks: quirks to apply
+ *
+ * An EDID detailed timing block contains enough info for us to create and
+ * return a new struct drm_display_mode.
+ */
+static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev,
+						  struct edid *edid,
+						  struct detailed_timing *timing,
+						  u32 quirks)
+{
+	struct drm_display_mode *mode;
+	struct detailed_pixel_timing *pt = &timing->data.pixel_data;
+
+	if (pt->stereo) {
+		printk(KERN_WARNING "stereo mode not supported\n");
+		return NULL;
+	}
+	if (!pt->separate_sync) {
+		printk(KERN_WARNING "integrated sync not supported\n");
+		return NULL;
+	}
+
+	mode = drm_mode_create(dev);
+	if (!mode)
+		return NULL;
+
+	mode->type = DRM_MODE_TYPE_DRIVER;
+
+	if (quirks & EDID_QUIRK_135_CLOCK_TOO_HIGH)
+		timing->pixel_clock = 1088;
+
+	mode->clock = timing->pixel_clock * 10;
+
+	mode->hdisplay = (pt->hactive_hi << 8) | pt->hactive_lo;
+	mode->hsync_start = mode->hdisplay + ((pt->hsync_offset_hi << 8) |
+					      pt->hsync_offset_lo);
+	mode->hsync_end = mode->hsync_start +
+		((pt->hsync_pulse_width_hi << 8) |
+		 pt->hsync_pulse_width_lo);
+	mode->htotal = mode->hdisplay + ((pt->hblank_hi << 8) | pt->hblank_lo);
+
+	mode->vdisplay = (pt->vactive_hi << 8) | pt->vactive_lo;
+	mode->vsync_start = mode->vdisplay + ((pt->vsync_offset_hi << 8) |
+					      pt->vsync_offset_lo);
+	mode->vsync_end = mode->vsync_start +
+		((pt->vsync_pulse_width_hi << 8) |
+		 pt->vsync_pulse_width_lo);
+	mode->vtotal = mode->vdisplay + ((pt->vblank_hi << 8) | pt->vblank_lo);
+
+	drm_mode_set_name(mode);
+
+	if (pt->interlaced)
+		mode->flags |= DRM_MODE_FLAG_INTERLACE;
+
+	if (quirks & EDID_QUIRK_DETAILED_SYNC_PP) {
+		pt->hsync_positive = 1;
+		pt->vsync_positive = 1;
+	}
+
+	mode->flags |= pt->hsync_positive ? DRM_MODE_FLAG_PHSYNC : DRM_MODE_FLAG_NHSYNC;
+	mode->flags |= pt->vsync_positive ? DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC;
+
+	mode->width_mm = pt->width_mm_lo | (pt->width_mm_hi << 8);
+	mode->height_mm = pt->height_mm_lo | (pt->height_mm_hi << 8);
+
+	if (quirks & EDID_QUIRK_DETAILED_IN_CM) {
+		mode->width_mm *= 10;
+		mode->height_mm *= 10;
+	}
+
+	if (quirks & EDID_QUIRK_DETAILED_USE_MAXIMUM_SIZE) {
+		mode->width_mm = edid->width_cm * 10;
+		mode->height_mm = edid->height_cm * 10;
+	}
+
+	return mode;
+}
+
+/*
+ * Detailed mode info for the EDID "established modes" data to use.
+ */
+static struct drm_display_mode edid_est_modes[] = {
+	{ DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 40000, 800, 840,
+		   968, 1056, 0, 600, 601, 605, 628, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@60Hz */
+	{ DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 36000, 800, 824,
+		   896, 1024, 0, 600, 601, 603,  625, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@56Hz */
+	{ DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 31500, 640, 656,
+		   720, 840, 0, 480, 481, 484, 500, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@75Hz */
+	{ DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 31500, 640, 664,
+		   704,  832, 0, 480, 489, 491, 520, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@72Hz */
+	{ DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 30240, 640, 704,
+		   768,  864, 0, 480, 483, 486, 525, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@67Hz */
+	{ DRM_MODE("640x480", DRM_MODE_TYPE_DRIVER, 25200, 640, 656,
+		   752, 800, 0, 480, 490, 492, 525, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 640x480@60Hz */
+	{ DRM_MODE("720x400", DRM_MODE_TYPE_DRIVER, 35500, 720, 738,
+		   846, 900, 0, 400, 421, 423,  449, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 720x400@88Hz */
+	{ DRM_MODE("720x400", DRM_MODE_TYPE_DRIVER, 28320, 720, 738,
+		   846,  900, 0, 400, 412, 414, 449, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 720x400@70Hz */
+	{ DRM_MODE("1280x1024", DRM_MODE_TYPE_DRIVER, 135000, 1280, 1296,
+		   1440, 1688, 0, 1024, 1025, 1028, 1066, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 1280x1024@75Hz */
+	{ DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 78800, 1024, 1040,
+		   1136, 1312, 0,  768, 769, 772, 800, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 1024x768@75Hz */
+	{ DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 75000, 1024, 1048,
+		   1184, 1328, 0,  768, 771, 777, 806, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 1024x768@70Hz */
+	{ DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER, 65000, 1024, 1048,
+		   1184, 1344, 0,  768, 771, 777, 806, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 1024x768@60Hz */
+	{ DRM_MODE("1024x768", DRM_MODE_TYPE_DRIVER,44900, 1024, 1032,
+		   1208, 1264, 0, 768, 768, 776, 817, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC | DRM_MODE_FLAG_INTERLACE) }, /* 1024x768@43Hz */
+	{ DRM_MODE("832x624", DRM_MODE_TYPE_DRIVER, 57284, 832, 864,
+		   928, 1152, 0, 624, 625, 628, 667, 0,
+		   DRM_MODE_FLAG_NHSYNC | DRM_MODE_FLAG_NVSYNC) }, /* 832x624@75Hz */
+	{ DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 49500, 800, 816,
+		   896, 1056, 0, 600, 601, 604,  625, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@75Hz */
+	{ DRM_MODE("800x600", DRM_MODE_TYPE_DRIVER, 50000, 800, 856,
+		   976, 1040, 0, 600, 637, 643, 666, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 800x600@72Hz */
+	{ DRM_MODE("1152x864", DRM_MODE_TYPE_DRIVER, 108000, 1152, 1216,
+		   1344, 1600, 0,  864, 865, 868, 900, 0,
+		   DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) }, /* 1152x864@75Hz */
+};
+
+#define EDID_EST_TIMINGS 16
+#define EDID_STD_TIMINGS 8
+#define EDID_DETAILED_TIMINGS 4
+
+/**
+ * add_established_modes - get est. modes from EDID and add them
+ * @edid: EDID block to scan
+ *
+ * Each EDID block contains a bitmap of the supported "established modes" list
+ * (defined above).  Tease them out and add them to the global modes list.
+ */
+static int add_established_modes(struct drm_connector *connector, struct edid *edid)
+{
+	struct drm_device *dev = connector->dev;
+	unsigned long est_bits = edid->established_timings.t1 |
+		(edid->established_timings.t2 << 8) |
+		((edid->established_timings.mfg_rsvd & 0x80) << 9);
+	int i, modes = 0;
+
+	for (i = 0; i <= EDID_EST_TIMINGS; i++)
+		if (est_bits & (1<<i)) {
+			struct drm_display_mode *newmode;
+			newmode = drm_mode_duplicate(dev, &edid_est_modes[i]);
+			if (newmode) {
+				drm_mode_probed_add(connector, newmode);
+				modes++;
+			}
+		}
+
+	return modes;
+}
+
+/**
+ * add_standard_modes - get std. modes from EDID and add them
+ * @edid: EDID block to scan
+ *
+ * Standard modes can be calculated using the CVT standard.  Grab them from
+ * @edid, calculate them, and add them to the list.
+ */
+static int add_standard_modes(struct drm_connector *connector, struct edid *edid)
+{
+	struct drm_device *dev = connector->dev;
+	int i, modes = 0;
+
+	for (i = 0; i < EDID_STD_TIMINGS; i++) {
+		struct std_timing *t = &edid->standard_timings[i];
+		struct drm_display_mode *newmode;
+
+		/* If std timings bytes are 1, 1 it's empty */
+		if (t->hsize == 1 && (t->aspect_ratio | t->vfreq) == 1)
+			continue;
+
+		newmode = drm_mode_std(dev, &edid->standard_timings[i]);
+		if (newmode) {
+			drm_mode_probed_add(connector, newmode);
+			modes++;
+		}
+	}
+
+	return modes;
+}
+
+/**
+ * add_detailed_modes - get detailed mode info from EDID data
+ * @connector: attached connector
+ * @edid: EDID block to scan
+ * @quirks: quirks to apply
+ *
+ * Some of the detailed timing sections may contain mode information.  Grab
+ * it and add it to the list.
+ */
+static int add_detailed_info(struct drm_connector *connector,
+			     struct edid *edid, u32 quirks)
+{
+	struct drm_device *dev = connector->dev;
+	int i, j, modes = 0;
+
+	for (i = 0; i < EDID_DETAILED_TIMINGS; i++) {
+		struct detailed_timing *timing = &edid->detailed_timings[i];
+		struct detailed_non_pixel *data = &timing->data.other_data;
+		struct drm_display_mode *newmode;
+
+		/* EDID up to and including 1.2 may put monitor info here */
+		if (edid->version == 1 && edid->revision < 3)
+			continue;
+
+		/* Detailed mode timing */
+		if (timing->pixel_clock) {
+			newmode = drm_mode_detailed(dev, edid, timing, quirks);
+			if (!newmode)
+				continue;
+
+			/* First detailed mode is preferred */
+			if (i == 0 && edid->preferred_timing)
+				newmode->type |= DRM_MODE_TYPE_PREFERRED;
+			drm_mode_probed_add(connector, newmode);
+
+			modes++;
+			continue;
+		}
+
+		/* Other timing or info */
+		switch (data->type) {
+		case EDID_DETAIL_MONITOR_SERIAL:
+			break;
+		case EDID_DETAIL_MONITOR_STRING:
+			break;
+		case EDID_DETAIL_MONITOR_RANGE:
+			/* Get monitor range data */
+			break;
+		case EDID_DETAIL_MONITOR_NAME:
+			break;
+		case EDID_DETAIL_MONITOR_CPDATA:
+			break;
+		case EDID_DETAIL_STD_MODES:
+			/* Five modes per detailed section */
+			for (j = 0; j < 5; i++) {
+				struct std_timing *std;
+				struct drm_display_mode *newmode;
+
+				std = &data->data.timings[j];
+				newmode = drm_mode_std(dev, std);
+				if (newmode) {
+					drm_mode_probed_add(connector, newmode);
+					modes++;
+				}
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	return modes;
+}
+
+#define DDC_ADDR 0x50
+
+unsigned char *drm_do_probe_ddc_edid(struct i2c_adapter *adapter)
+{
+	unsigned char start = 0x0;
+	unsigned char *buf = kmalloc(EDID_LENGTH, GFP_KERNEL);
+	struct i2c_msg msgs[] = {
+		{
+			.addr	= DDC_ADDR,
+			.flags	= 0,
+			.len	= 1,
+			.buf	= &start,
+		}, {
+			.addr	= DDC_ADDR,
+			.flags	= I2C_M_RD,
+			.len	= EDID_LENGTH,
+			.buf	= buf,
+		}
+	};
+
+	if (!buf) {
+		dev_warn(&adapter->dev, "unable to allocate memory for EDID "
+			 "block.\n");
+		return NULL;
+	}
+
+	if (i2c_transfer(adapter, msgs, 2) == 2)
+		return buf;
+
+	dev_info(&adapter->dev, "unable to read EDID block.\n");
+	kfree(buf);
+	return NULL;
+}
+EXPORT_SYMBOL(drm_do_probe_ddc_edid);
+
+static unsigned char *drm_ddc_read(struct i2c_adapter *adapter)
+{
+	struct i2c_algo_bit_data *algo_data = adapter->algo_data;
+	unsigned char *edid = NULL;
+	int i, j;
+
+	algo_data->setscl(algo_data->data, 1);
+
+	for (i = 0; i < 1; i++) {
+		/* For some old monitors we need the
+		 * following process to initialize/stop DDC
+		 */
+		algo_data->setsda(algo_data->data, 1);
+		msleep(13);
+
+		algo_data->setscl(algo_data->data, 1);
+		for (j = 0; j < 5; j++) {
+			msleep(10);
+			if (algo_data->getscl(algo_data->data))
+				break;
+		}
+		if (j == 5)
+			continue;
+
+		algo_data->setsda(algo_data->data, 0);
+		msleep(15);
+		algo_data->setscl(algo_data->data, 0);
+		msleep(15);
+		algo_data->setsda(algo_data->data, 1);
+		msleep(15);
+
+		/* Do the real work */
+		edid = drm_do_probe_ddc_edid(adapter);
+		algo_data->setsda(algo_data->data, 0);
+		algo_data->setscl(algo_data->data, 0);
+		msleep(15);
+
+		algo_data->setscl(algo_data->data, 1);
+		for (j = 0; j < 10; j++) {
+			msleep(10);
+			if (algo_data->getscl(algo_data->data))
+				break;
+		}
+
+		algo_data->setsda(algo_data->data, 1);
+		msleep(15);
+		algo_data->setscl(algo_data->data, 0);
+		algo_data->setsda(algo_data->data, 0);
+		if (edid)
+			break;
+	}
+	/* Release the DDC lines when done or the Apple Cinema HD display
+	 * will switch off
+	 */
+	algo_data->setsda(algo_data->data, 1);
+	algo_data->setscl(algo_data->data, 1);
+
+	return edid;
+}
+
+/**
+ * drm_get_edid - get EDID data, if available
+ * @connector: connector we're probing
+ * @adapter: i2c adapter to use for DDC
+ *
+ * Poke the given connector's i2c channel to grab EDID data if possible.
+ *
+ * Return edid data or NULL if we couldn't find any.
+ */
+struct edid *drm_get_edid(struct drm_connector *connector,
+			  struct i2c_adapter *adapter)
+{
+	struct edid *edid;
+
+	edid = (struct edid *)drm_ddc_read(adapter);
+	if (!edid) {
+		dev_warn(&connector->dev->pdev->dev, "%s: no EDID data\n",
+			 drm_get_connector_name(connector));
+		return NULL;
+	}
+	if (!edid_is_valid(edid)) {
+		dev_warn(&connector->dev->pdev->dev, "%s: EDID invalid.\n",
+			 drm_get_connector_name(connector));
+		kfree(edid);
+		return NULL;
+	}
+
+	connector->display_info.raw_edid = (char *)edid;
+
+	return edid;
+}
+EXPORT_SYMBOL(drm_get_edid);
+
+/**
+ * drm_add_edid_modes - add modes from EDID data, if available
+ * @connector: connector we're probing
+ * @edid: edid data
+ *
+ * Add the specified modes to the connector's mode list.
+ *
+ * Return number of modes added or 0 if we couldn't find any.
+ */
+int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid)
+{
+	int num_modes = 0;
+	u32 quirks;
+
+	if (edid == NULL) {
+		return 0;
+	}
+	if (!edid_is_valid(edid)) {
+		dev_warn(&connector->dev->pdev->dev, "%s: EDID invalid.\n",
+			 drm_get_connector_name(connector));
+		return 0;
+	}
+
+	quirks = edid_get_quirks(edid);
+
+	num_modes += add_established_modes(connector, edid);
+	num_modes += add_standard_modes(connector, edid);
+	num_modes += add_detailed_info(connector, edid, quirks);
+
+	if (quirks & (EDID_QUIRK_PREFER_LARGE_60 | EDID_QUIRK_PREFER_LARGE_75))
+		edid_fixup_preferred(connector, quirks);
+
+	connector->display_info.serration_vsync = edid->serration_vsync;
+	connector->display_info.sync_on_green = edid->sync_on_green;
+	connector->display_info.composite_sync = edid->composite_sync;
+	connector->display_info.separate_syncs = edid->separate_syncs;
+	connector->display_info.blank_to_black = edid->blank_to_black;
+	connector->display_info.video_level = edid->video_level;
+	connector->display_info.digital = edid->digital;
+	connector->display_info.width_mm = edid->width_cm * 10;
+	connector->display_info.height_mm = edid->height_cm * 10;
+	connector->display_info.gamma = edid->gamma;
+	connector->display_info.gtf_supported = edid->default_gtf;
+	connector->display_info.standard_color = edid->standard_color;
+	connector->display_info.display_type = edid->display_type;
+	connector->display_info.active_off_supported = edid->pm_active_off;
+	connector->display_info.suspend_supported = edid->pm_suspend;
+	connector->display_info.standby_supported = edid->pm_standby;
+	connector->display_info.gamma = edid->gamma;
+
+	return num_modes;
+}
+EXPORT_SYMBOL(drm_add_edid_modes);
diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c
index 3a6c439652a..3733e36d135 100644
--- a/drivers/gpu/drm/drm_fops.c
+++ b/drivers/gpu/drm/drm_fops.c
@@ -35,7 +35,6 @@
  */
 
 #include "drmP.h"
-#include "drm_sarea.h"
 #include <linux/poll.h>
 #include <linux/smp_lock.h>
 
@@ -55,10 +54,12 @@ static int drm_setup(struct drm_device * dev)
 
 	atomic_set(&dev->ioctl_count, 0);
 	atomic_set(&dev->vma_count, 0);
-	dev->buf_use = 0;
-	atomic_set(&dev->buf_alloc, 0);
 
-	if (drm_core_check_feature(dev, DRIVER_HAVE_DMA)) {
+	if (drm_core_check_feature(dev, DRIVER_HAVE_DMA) &&
+	    !drm_core_check_feature(dev, DRIVER_MODESET)) {
+		dev->buf_use = 0;
+		atomic_set(&dev->buf_alloc, 0);
+
 		i = drm_dma_setup(dev);
 		if (i < 0)
 			return i;
@@ -138,14 +139,14 @@ int drm_open(struct inode *inode, struct file *filp)
 		}
 		spin_unlock(&dev->count_lock);
 	}
-
 out:
 	mutex_lock(&dev->struct_mutex);
-	if (dev->dev_mapping == NULL)
-		dev->dev_mapping = inode->i_mapping;
-	else if (dev->dev_mapping != inode->i_mapping)
-		WARN(1, "dev->dev_mapping not inode mapping (%p expected %p)\n",
-		     dev->dev_mapping, inode->i_mapping);
+	if (minor->type == DRM_MINOR_LEGACY) {
+		BUG_ON((dev->dev_mapping != NULL) &&
+			(dev->dev_mapping != inode->i_mapping));
+		if (dev->dev_mapping == NULL)
+			dev->dev_mapping = inode->i_mapping;
+	}
 	mutex_unlock(&dev->struct_mutex);
 
 	return retcode;
@@ -251,6 +252,7 @@ static int drm_open_helper(struct inode *inode, struct file *filp,
 	priv->lock_count = 0;
 
 	INIT_LIST_HEAD(&priv->lhead);
+	INIT_LIST_HEAD(&priv->fbs);
 
 	if (dev->driver->driver_features & DRIVER_GEM)
 		drm_gem_open(dev, priv);
diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
index 1e787f894b3..1608f8dbfda 100644
--- a/drivers/gpu/drm/drm_irq.c
+++ b/drivers/gpu/drm/drm_irq.c
@@ -305,6 +305,8 @@ int drm_control(struct drm_device *dev, void *data,
 	case DRM_INST_HANDLER:
 		if (!drm_core_check_feature(dev, DRIVER_HAVE_IRQ))
 			return 0;
+		if (drm_core_check_feature(dev, DRIVER_MODESET))
+			return 0;
 		if (dev->if_version < DRM_IF_VERSION(1, 2) &&
 		    ctl->irq != dev->pdev->irq)
 			return -EINVAL;
@@ -312,6 +314,8 @@ int drm_control(struct drm_device *dev, void *data,
 	case DRM_UNINST_HANDLER:
 		if (!drm_core_check_feature(dev, DRIVER_HAVE_IRQ))
 			return 0;
+		if (drm_core_check_feature(dev, DRIVER_MODESET))
+			return 0;
 		return drm_irq_uninstall(dev);
 	default:
 		return -EINVAL;
@@ -426,6 +430,45 @@ void drm_vblank_put(struct drm_device *dev, int crtc)
 }
 EXPORT_SYMBOL(drm_vblank_put);
 
+/**
+ * drm_vblank_pre_modeset - account for vblanks across mode sets
+ * @dev: DRM device
+ * @crtc: CRTC in question
+ * @post: post or pre mode set?
+ *
+ * Account for vblank events across mode setting events, which will likely
+ * reset the hardware frame counter.
+ */
+void drm_vblank_pre_modeset(struct drm_device *dev, int crtc)
+{
+	/*
+	 * To avoid all the problems that might happen if interrupts
+	 * were enabled/disabled around or between these calls, we just
+	 * have the kernel take a reference on the CRTC (just once though
+	 * to avoid corrupting the count if multiple, mismatch calls occur),
+	 * so that interrupts remain enabled in the interim.
+	 */
+	if (!dev->vblank_inmodeset[crtc]) {
+		dev->vblank_inmodeset[crtc] = 1;
+		drm_vblank_get(dev, crtc);
+	}
+}
+EXPORT_SYMBOL(drm_vblank_pre_modeset);
+
+void drm_vblank_post_modeset(struct drm_device *dev, int crtc)
+{
+	unsigned long irqflags;
+
+	if (dev->vblank_inmodeset[crtc]) {
+		spin_lock_irqsave(&dev->vbl_lock, irqflags);
+		dev->vblank_disable_allowed = 1;
+		dev->vblank_inmodeset[crtc] = 0;
+		spin_unlock_irqrestore(&dev->vbl_lock, irqflags);
+		drm_vblank_put(dev, crtc);
+	}
+}
+EXPORT_SYMBOL(drm_vblank_post_modeset);
+
 /**
  * drm_modeset_ctl - handle vblank event counter changes across mode switch
  * @DRM_IOCTL_ARGS: standard ioctl arguments
@@ -441,7 +484,6 @@ int drm_modeset_ctl(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv)
 {
 	struct drm_modeset_ctl *modeset = data;
-	unsigned long irqflags;
 	int crtc, ret = 0;
 
 	/* If drm_vblank_init() hasn't been called yet, just no-op */
@@ -454,28 +496,12 @@ int drm_modeset_ctl(struct drm_device *dev, void *data,
 		goto out;
 	}
 
-	/*
-	 * To avoid all the problems that might happen if interrupts
-	 * were enabled/disabled around or between these calls, we just
-	 * have the kernel take a reference on the CRTC (just once though
-	 * to avoid corrupting the count if multiple, mismatch calls occur),
-	 * so that interrupts remain enabled in the interim.
-	 */
 	switch (modeset->cmd) {
 	case _DRM_PRE_MODESET:
-		if (!dev->vblank_inmodeset[crtc]) {
-			dev->vblank_inmodeset[crtc] = 1;
-			drm_vblank_get(dev, crtc);
-		}
+		drm_vblank_pre_modeset(dev, crtc);
 		break;
 	case _DRM_POST_MODESET:
-		if (dev->vblank_inmodeset[crtc]) {
-			spin_lock_irqsave(&dev->vbl_lock, irqflags);
-			dev->vblank_disable_allowed = 1;
-			dev->vblank_inmodeset[crtc] = 0;
-			spin_unlock_irqrestore(&dev->vbl_lock, irqflags);
-			drm_vblank_put(dev, crtc);
-		}
+		drm_vblank_post_modeset(dev, crtc);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index 217ad7dc707..367c590ffbb 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -296,3 +296,4 @@ void drm_mm_takedown(struct drm_mm * mm)
 
 	drm_free(entry, sizeof(*entry), DRM_MEM_MM);
 }
+EXPORT_SYMBOL(drm_mm_takedown);
diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c
new file mode 100644
index 00000000000..c9b80fdd463
--- /dev/null
+++ b/drivers/gpu/drm/drm_modes.c
@@ -0,0 +1,576 @@
+/*
+ * The list_sort function is (presumably) licensed under the GPL (see the
+ * top level "COPYING" file for details).
+ *
+ * The remainder of this file is:
+ *
+ * Copyright © 1997-2003 by The XFree86 Project, Inc.
+ * Copyright © 2007 Dave Airlie
+ * Copyright © 2007-2008 Intel Corporation
+ *   Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the copyright holder(s)
+ * and author(s) shall not be used in advertising or otherwise to promote
+ * the sale, use or other dealings in this Software without prior written
+ * authorization from the copyright holder(s) and author(s).
+ */
+
+#include <linux/list.h>
+#include "drmP.h"
+#include "drm.h"
+#include "drm_crtc.h"
+
+/**
+ * drm_mode_debug_printmodeline - debug print a mode
+ * @dev: DRM device
+ * @mode: mode to print
+ *
+ * LOCKING:
+ * None.
+ *
+ * Describe @mode using DRM_DEBUG.
+ */
+void drm_mode_debug_printmodeline(struct drm_display_mode *mode)
+{
+	DRM_DEBUG("Modeline %d:\"%s\" %d %d %d %d %d %d %d %d %d %d 0x%x 0x%x\n",
+		  mode->base.id, mode->name, mode->vrefresh, mode->clock,
+		  mode->hdisplay, mode->hsync_start,
+		  mode->hsync_end, mode->htotal,
+		  mode->vdisplay, mode->vsync_start,
+		  mode->vsync_end, mode->vtotal, mode->type, mode->flags);
+}
+EXPORT_SYMBOL(drm_mode_debug_printmodeline);
+
+/**
+ * drm_mode_set_name - set the name on a mode
+ * @mode: name will be set in this mode
+ *
+ * LOCKING:
+ * None.
+ *
+ * Set the name of @mode to a standard format.
+ */
+void drm_mode_set_name(struct drm_display_mode *mode)
+{
+	snprintf(mode->name, DRM_DISPLAY_MODE_LEN, "%dx%d", mode->hdisplay,
+		 mode->vdisplay);
+}
+EXPORT_SYMBOL(drm_mode_set_name);
+
+/**
+ * drm_mode_list_concat - move modes from one list to another
+ * @head: source list
+ * @new: dst list
+ *
+ * LOCKING:
+ * Caller must ensure both lists are locked.
+ *
+ * Move all the modes from @head to @new.
+ */
+void drm_mode_list_concat(struct list_head *head, struct list_head *new)
+{
+
+	struct list_head *entry, *tmp;
+
+	list_for_each_safe(entry, tmp, head) {
+		list_move_tail(entry, new);
+	}
+}
+EXPORT_SYMBOL(drm_mode_list_concat);
+
+/**
+ * drm_mode_width - get the width of a mode
+ * @mode: mode
+ *
+ * LOCKING:
+ * None.
+ *
+ * Return @mode's width (hdisplay) value.
+ *
+ * FIXME: is this needed?
+ *
+ * RETURNS:
+ * @mode->hdisplay
+ */
+int drm_mode_width(struct drm_display_mode *mode)
+{
+	return mode->hdisplay;
+
+}
+EXPORT_SYMBOL(drm_mode_width);
+
+/**
+ * drm_mode_height - get the height of a mode
+ * @mode: mode
+ *
+ * LOCKING:
+ * None.
+ *
+ * Return @mode's height (vdisplay) value.
+ *
+ * FIXME: is this needed?
+ *
+ * RETURNS:
+ * @mode->vdisplay
+ */
+int drm_mode_height(struct drm_display_mode *mode)
+{
+	return mode->vdisplay;
+}
+EXPORT_SYMBOL(drm_mode_height);
+
+/**
+ * drm_mode_vrefresh - get the vrefresh of a mode
+ * @mode: mode
+ *
+ * LOCKING:
+ * None.
+ *
+ * Return @mode's vrefresh rate or calculate it if necessary.
+ *
+ * FIXME: why is this needed?  shouldn't vrefresh be set already?
+ *
+ * RETURNS:
+ * Vertical refresh rate of @mode x 1000. For precision reasons.
+ */
+int drm_mode_vrefresh(struct drm_display_mode *mode)
+{
+	int refresh = 0;
+	unsigned int calc_val;
+
+	if (mode->vrefresh > 0)
+		refresh = mode->vrefresh;
+	else if (mode->htotal > 0 && mode->vtotal > 0) {
+		/* work out vrefresh the value will be x1000 */
+		calc_val = (mode->clock * 1000);
+
+		calc_val /= mode->htotal;
+		calc_val *= 1000;
+		calc_val /= mode->vtotal;
+
+		refresh = calc_val;
+		if (mode->flags & DRM_MODE_FLAG_INTERLACE)
+			refresh *= 2;
+		if (mode->flags & DRM_MODE_FLAG_DBLSCAN)
+			refresh /= 2;
+		if (mode->vscan > 1)
+			refresh /= mode->vscan;
+	}
+	return refresh;
+}
+EXPORT_SYMBOL(drm_mode_vrefresh);
+
+/**
+ * drm_mode_set_crtcinfo - set CRTC modesetting parameters
+ * @p: mode
+ * @adjust_flags: unused? (FIXME)
+ *
+ * LOCKING:
+ * None.
+ *
+ * Setup the CRTC modesetting parameters for @p, adjusting if necessary.
+ */
+void drm_mode_set_crtcinfo(struct drm_display_mode *p, int adjust_flags)
+{
+	if ((p == NULL) || ((p->type & DRM_MODE_TYPE_CRTC_C) == DRM_MODE_TYPE_BUILTIN))
+		return;
+
+	p->crtc_hdisplay = p->hdisplay;
+	p->crtc_hsync_start = p->hsync_start;
+	p->crtc_hsync_end = p->hsync_end;
+	p->crtc_htotal = p->htotal;
+	p->crtc_hskew = p->hskew;
+	p->crtc_vdisplay = p->vdisplay;
+	p->crtc_vsync_start = p->vsync_start;
+	p->crtc_vsync_end = p->vsync_end;
+	p->crtc_vtotal = p->vtotal;
+
+	if (p->flags & DRM_MODE_FLAG_INTERLACE) {
+		if (adjust_flags & CRTC_INTERLACE_HALVE_V) {
+			p->crtc_vdisplay /= 2;
+			p->crtc_vsync_start /= 2;
+			p->crtc_vsync_end /= 2;
+			p->crtc_vtotal /= 2;
+		}
+
+		p->crtc_vtotal |= 1;
+	}
+
+	if (p->flags & DRM_MODE_FLAG_DBLSCAN) {
+		p->crtc_vdisplay *= 2;
+		p->crtc_vsync_start *= 2;
+		p->crtc_vsync_end *= 2;
+		p->crtc_vtotal *= 2;
+	}
+
+	if (p->vscan > 1) {
+		p->crtc_vdisplay *= p->vscan;
+		p->crtc_vsync_start *= p->vscan;
+		p->crtc_vsync_end *= p->vscan;
+		p->crtc_vtotal *= p->vscan;
+	}
+
+	p->crtc_vblank_start = min(p->crtc_vsync_start, p->crtc_vdisplay);
+	p->crtc_vblank_end = max(p->crtc_vsync_end, p->crtc_vtotal);
+	p->crtc_hblank_start = min(p->crtc_hsync_start, p->crtc_hdisplay);
+	p->crtc_hblank_end = max(p->crtc_hsync_end, p->crtc_htotal);
+
+	p->crtc_hadjusted = false;
+	p->crtc_vadjusted = false;
+}
+EXPORT_SYMBOL(drm_mode_set_crtcinfo);
+
+
+/**
+ * drm_mode_duplicate - allocate and duplicate an existing mode
+ * @m: mode to duplicate
+ *
+ * LOCKING:
+ * None.
+ *
+ * Just allocate a new mode, copy the existing mode into it, and return
+ * a pointer to it.  Used to create new instances of established modes.
+ */
+struct drm_display_mode *drm_mode_duplicate(struct drm_device *dev,
+					    struct drm_display_mode *mode)
+{
+	struct drm_display_mode *nmode;
+	int new_id;
+
+	nmode = drm_mode_create(dev);
+	if (!nmode)
+		return NULL;
+
+	new_id = nmode->base.id;
+	*nmode = *mode;
+	nmode->base.id = new_id;
+	INIT_LIST_HEAD(&nmode->head);
+	return nmode;
+}
+EXPORT_SYMBOL(drm_mode_duplicate);
+
+/**
+ * drm_mode_equal - test modes for equality
+ * @mode1: first mode
+ * @mode2: second mode
+ *
+ * LOCKING:
+ * None.
+ *
+ * Check to see if @mode1 and @mode2 are equivalent.
+ *
+ * RETURNS:
+ * True if the modes are equal, false otherwise.
+ */
+bool drm_mode_equal(struct drm_display_mode *mode1, struct drm_display_mode *mode2)
+{
+	/* do clock check convert to PICOS so fb modes get matched
+	 * the same */
+	if (mode1->clock && mode2->clock) {
+		if (KHZ2PICOS(mode1->clock) != KHZ2PICOS(mode2->clock))
+			return false;
+	} else if (mode1->clock != mode2->clock)
+		return false;
+
+	if (mode1->hdisplay == mode2->hdisplay &&
+	    mode1->hsync_start == mode2->hsync_start &&
+	    mode1->hsync_end == mode2->hsync_end &&
+	    mode1->htotal == mode2->htotal &&
+	    mode1->hskew == mode2->hskew &&
+	    mode1->vdisplay == mode2->vdisplay &&
+	    mode1->vsync_start == mode2->vsync_start &&
+	    mode1->vsync_end == mode2->vsync_end &&
+	    mode1->vtotal == mode2->vtotal &&
+	    mode1->vscan == mode2->vscan &&
+	    mode1->flags == mode2->flags)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(drm_mode_equal);
+
+/**
+ * drm_mode_validate_size - make sure modes adhere to size constraints
+ * @dev: DRM device
+ * @mode_list: list of modes to check
+ * @maxX: maximum width
+ * @maxY: maximum height
+ * @maxPitch: max pitch
+ *
+ * LOCKING:
+ * Caller must hold a lock protecting @mode_list.
+ *
+ * The DRM device (@dev) has size and pitch limits.  Here we validate the
+ * modes we probed for @dev against those limits and set their status as
+ * necessary.
+ */
+void drm_mode_validate_size(struct drm_device *dev,
+			    struct list_head *mode_list,
+			    int maxX, int maxY, int maxPitch)
+{
+	struct drm_display_mode *mode;
+
+	list_for_each_entry(mode, mode_list, head) {
+		if (maxPitch > 0 && mode->hdisplay > maxPitch)
+			mode->status = MODE_BAD_WIDTH;
+
+		if (maxX > 0 && mode->hdisplay > maxX)
+			mode->status = MODE_VIRTUAL_X;
+
+		if (maxY > 0 && mode->vdisplay > maxY)
+			mode->status = MODE_VIRTUAL_Y;
+	}
+}
+EXPORT_SYMBOL(drm_mode_validate_size);
+
+/**
+ * drm_mode_validate_clocks - validate modes against clock limits
+ * @dev: DRM device
+ * @mode_list: list of modes to check
+ * @min: minimum clock rate array
+ * @max: maximum clock rate array
+ * @n_ranges: number of clock ranges (size of arrays)
+ *
+ * LOCKING:
+ * Caller must hold a lock protecting @mode_list.
+ *
+ * Some code may need to check a mode list against the clock limits of the
+ * device in question.  This function walks the mode list, testing to make
+ * sure each mode falls within a given range (defined by @min and @max
+ * arrays) and sets @mode->status as needed.
+ */
+void drm_mode_validate_clocks(struct drm_device *dev,
+			      struct list_head *mode_list,
+			      int *min, int *max, int n_ranges)
+{
+	struct drm_display_mode *mode;
+	int i;
+
+	list_for_each_entry(mode, mode_list, head) {
+		bool good = false;
+		for (i = 0; i < n_ranges; i++) {
+			if (mode->clock >= min[i] && mode->clock <= max[i]) {
+				good = true;
+				break;
+			}
+		}
+		if (!good)
+			mode->status = MODE_CLOCK_RANGE;
+	}
+}
+EXPORT_SYMBOL(drm_mode_validate_clocks);
+
+/**
+ * drm_mode_prune_invalid - remove invalid modes from mode list
+ * @dev: DRM device
+ * @mode_list: list of modes to check
+ * @verbose: be verbose about it
+ *
+ * LOCKING:
+ * Caller must hold a lock protecting @mode_list.
+ *
+ * Once mode list generation is complete, a caller can use this routine to
+ * remove invalid modes from a mode list.  If any of the modes have a
+ * status other than %MODE_OK, they are removed from @mode_list and freed.
+ */
+void drm_mode_prune_invalid(struct drm_device *dev,
+			    struct list_head *mode_list, bool verbose)
+{
+	struct drm_display_mode *mode, *t;
+
+	list_for_each_entry_safe(mode, t, mode_list, head) {
+		if (mode->status != MODE_OK) {
+			list_del(&mode->head);
+			if (verbose) {
+				drm_mode_debug_printmodeline(mode);
+				DRM_DEBUG("Not using %s mode %d\n", mode->name, mode->status);
+			}
+			drm_mode_destroy(dev, mode);
+		}
+	}
+}
+EXPORT_SYMBOL(drm_mode_prune_invalid);
+
+/**
+ * drm_mode_compare - compare modes for favorability
+ * @lh_a: list_head for first mode
+ * @lh_b: list_head for second mode
+ *
+ * LOCKING:
+ * None.
+ *
+ * Compare two modes, given by @lh_a and @lh_b, returning a value indicating
+ * which is better.
+ *
+ * RETURNS:
+ * Negative if @lh_a is better than @lh_b, zero if they're equivalent, or
+ * positive if @lh_b is better than @lh_a.
+ */
+static int drm_mode_compare(struct list_head *lh_a, struct list_head *lh_b)
+{
+	struct drm_display_mode *a = list_entry(lh_a, struct drm_display_mode, head);
+	struct drm_display_mode *b = list_entry(lh_b, struct drm_display_mode, head);
+	int diff;
+
+	diff = ((b->type & DRM_MODE_TYPE_PREFERRED) != 0) -
+		((a->type & DRM_MODE_TYPE_PREFERRED) != 0);
+	if (diff)
+		return diff;
+	diff = b->hdisplay * b->vdisplay - a->hdisplay * a->vdisplay;
+	if (diff)
+		return diff;
+	diff = b->clock - a->clock;
+	return diff;
+}
+
+/* FIXME: what we don't have a list sort function? */
+/* list sort from Mark J Roberts (mjr@znex.org) */
+void list_sort(struct list_head *head,
+	       int (*cmp)(struct list_head *a, struct list_head *b))
+{
+	struct list_head *p, *q, *e, *list, *tail, *oldhead;
+	int insize, nmerges, psize, qsize, i;
+
+	list = head->next;
+	list_del(head);
+	insize = 1;
+	for (;;) {
+		p = oldhead = list;
+		list = tail = NULL;
+		nmerges = 0;
+
+		while (p) {
+			nmerges++;
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next == oldhead ? NULL : q->next;
+				if (!q)
+					break;
+			}
+
+			qsize = insize;
+			while (psize > 0 || (qsize > 0 && q)) {
+				if (!psize) {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				} else if (!qsize || !q) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else if (cmp(p, q) <= 0) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				}
+				if (tail)
+					tail->next = e;
+				else
+					list = e;
+				e->prev = tail;
+				tail = e;
+			}
+			p = q;
+		}
+
+		tail->next = list;
+		list->prev = tail;
+
+		if (nmerges <= 1)
+			break;
+
+		insize *= 2;
+	}
+
+	head->next = list;
+	head->prev = list->prev;
+	list->prev->next = head;
+	list->prev = head;
+}
+
+/**
+ * drm_mode_sort - sort mode list
+ * @mode_list: list to sort
+ *
+ * LOCKING:
+ * Caller must hold a lock protecting @mode_list.
+ *
+ * Sort @mode_list by favorability, putting good modes first.
+ */
+void drm_mode_sort(struct list_head *mode_list)
+{
+	list_sort(mode_list, drm_mode_compare);
+}
+EXPORT_SYMBOL(drm_mode_sort);
+
+/**
+ * drm_mode_connector_list_update - update the mode list for the connector
+ * @connector: the connector to update
+ *
+ * LOCKING:
+ * Caller must hold a lock protecting @mode_list.
+ *
+ * This moves the modes from the @connector probed_modes list
+ * to the actual mode list. It compares the probed mode against the current
+ * list and only adds different modes. All modes unverified after this point
+ * will be removed by the prune invalid modes.
+ */
+void drm_mode_connector_list_update(struct drm_connector *connector)
+{
+	struct drm_display_mode *mode;
+	struct drm_display_mode *pmode, *pt;
+	int found_it;
+
+	list_for_each_entry_safe(pmode, pt, &connector->probed_modes,
+				 head) {
+		found_it = 0;
+		/* go through current modes checking for the new probed mode */
+		list_for_each_entry(mode, &connector->modes, head) {
+			if (drm_mode_equal(pmode, mode)) {
+				found_it = 1;
+				/* if equal delete the probed mode */
+				mode->status = pmode->status;
+				list_del(&pmode->head);
+				drm_mode_destroy(connector->dev, pmode);
+				break;
+			}
+		}
+
+		if (!found_it) {
+			list_move_tail(&pmode->head, &connector->modes);
+		}
+	}
+}
+EXPORT_SYMBOL(drm_mode_connector_list_update);
diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c
index ea7f9e5d47f..5ca132afa4f 100644
--- a/drivers/gpu/drm/drm_stub.c
+++ b/drivers/gpu/drm/drm_stub.c
@@ -57,6 +57,14 @@ static int drm_minor_get_id(struct drm_device *dev, int type)
 	int ret;
 	int base = 0, limit = 63;
 
+	if (type == DRM_MINOR_CONTROL) {
+                base += 64;
+                limit = base + 127;
+        } else if (type == DRM_MINOR_RENDER) {
+                base += 128;
+                limit = base + 255;
+        }
+
 again:
 	if (idr_pre_get(&drm_minors_idr, GFP_KERNEL) == 0) {
 		DRM_ERROR("Out of memory expanding drawable idr\n");
@@ -362,12 +370,28 @@ int drm_get_dev(struct pci_dev *pdev, const struct pci_device_id *ent,
 		printk(KERN_ERR "DRM: Fill_in_dev failed.\n");
 		goto err_g2;
 	}
+
+	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
+		ret = drm_get_minor(dev, &dev->control, DRM_MINOR_CONTROL);
+		if (ret)
+			goto err_g2;
+	}
+
 	if ((ret = drm_get_minor(dev, &dev->primary, DRM_MINOR_LEGACY)))
-		goto err_g2;
+		goto err_g3;
 
-	if (dev->driver->load)
-		if ((ret = dev->driver->load(dev, ent->driver_data)))
+	if (dev->driver->load) {
+		ret = dev->driver->load(dev, ent->driver_data);
+		if (ret)
 			goto err_g3;
+	}
+
+        /* setup the grouping for the legacy output */
+	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
+		ret = drm_mode_group_init_legacy_group(dev, &dev->primary->mode_group);
+		if (ret)
+			goto err_g3;
+	}
 
 	list_add_tail(&dev->driver_item, &driver->device_list);
 
diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 1611b9bcbe7..65d72d094c8 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -20,6 +20,7 @@
 #include "drmP.h"
 
 #define to_drm_minor(d) container_of(d, struct drm_minor, kdev)
+#define to_drm_connector(d) container_of(d, struct drm_connector, kdev)
 
 /**
  * drm_sysfs_suspend - DRM class suspend hook
@@ -34,7 +35,7 @@ static int drm_sysfs_suspend(struct device *dev, pm_message_t state)
 	struct drm_minor *drm_minor = to_drm_minor(dev);
 	struct drm_device *drm_dev = drm_minor->dev;
 
-	if (drm_dev->driver->suspend)
+	if (drm_minor->type == DRM_MINOR_LEGACY && drm_dev->driver->suspend)
 		return drm_dev->driver->suspend(drm_dev, state);
 
 	return 0;
@@ -52,7 +53,7 @@ static int drm_sysfs_resume(struct device *dev)
 	struct drm_minor *drm_minor = to_drm_minor(dev);
 	struct drm_device *drm_dev = drm_minor->dev;
 
-	if (drm_dev->driver->resume)
+	if (drm_minor->type == DRM_MINOR_LEGACY && drm_dev->driver->resume)
 		return drm_dev->driver->resume(drm_dev);
 
 	return 0;
@@ -144,6 +145,323 @@ static void drm_sysfs_device_release(struct device *dev)
 	return;
 }
 
+/*
+ * Connector properties
+ */
+static ssize_t status_show(struct device *device,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct drm_connector *connector = to_drm_connector(device);
+	enum drm_connector_status status;
+
+	status = connector->funcs->detect(connector);
+	return snprintf(buf, PAGE_SIZE, "%s",
+			drm_get_connector_status_name(status));
+}
+
+static ssize_t dpms_show(struct device *device,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct drm_connector *connector = to_drm_connector(device);
+	struct drm_device *dev = connector->dev;
+	uint64_t dpms_status;
+	int ret;
+
+	ret = drm_connector_property_get_value(connector,
+					    dev->mode_config.dpms_property,
+					    &dpms_status);
+	if (ret)
+		return 0;
+
+	return snprintf(buf, PAGE_SIZE, "%s",
+			drm_get_dpms_name((int)dpms_status));
+}
+
+static ssize_t enabled_show(struct device *device,
+			    struct device_attribute *attr,
+			   char *buf)
+{
+	struct drm_connector *connector = to_drm_connector(device);
+
+	return snprintf(buf, PAGE_SIZE, connector->encoder ? "enabled" :
+			"disabled");
+}
+
+static ssize_t edid_show(struct kobject *kobj, struct bin_attribute *attr,
+			 char *buf, loff_t off, size_t count)
+{
+	struct device *connector_dev = container_of(kobj, struct device, kobj);
+	struct drm_connector *connector = to_drm_connector(connector_dev);
+	unsigned char *edid;
+	size_t size;
+
+	if (!connector->edid_blob_ptr)
+		return 0;
+
+	edid = connector->edid_blob_ptr->data;
+	size = connector->edid_blob_ptr->length;
+	if (!edid)
+		return 0;
+
+	if (off >= size)
+		return 0;
+
+	if (off + count > size)
+		count = size - off;
+	memcpy(buf, edid + off, count);
+
+	return count;
+}
+
+static ssize_t modes_show(struct device *device,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct drm_connector *connector = to_drm_connector(device);
+	struct drm_display_mode *mode;
+	int written = 0;
+
+	list_for_each_entry(mode, &connector->modes, head) {
+		written += snprintf(buf + written, PAGE_SIZE - written, "%s\n",
+				    mode->name);
+	}
+
+	return written;
+}
+
+static ssize_t subconnector_show(struct device *device,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct drm_connector *connector = to_drm_connector(device);
+	struct drm_device *dev = connector->dev;
+	struct drm_property *prop = NULL;
+	uint64_t subconnector;
+	int is_tv = 0;
+	int ret;
+
+	switch (connector->connector_type) {
+		case DRM_MODE_CONNECTOR_DVII:
+			prop = dev->mode_config.dvi_i_subconnector_property;
+			break;
+		case DRM_MODE_CONNECTOR_Composite:
+		case DRM_MODE_CONNECTOR_SVIDEO:
+		case DRM_MODE_CONNECTOR_Component:
+			prop = dev->mode_config.tv_subconnector_property;
+			is_tv = 1;
+			break;
+		default:
+			DRM_ERROR("Wrong connector type for this property\n");
+			return 0;
+	}
+
+	if (!prop) {
+		DRM_ERROR("Unable to find subconnector property\n");
+		return 0;
+	}
+
+	ret = drm_connector_property_get_value(connector, prop, &subconnector);
+	if (ret)
+		return 0;
+
+	return snprintf(buf, PAGE_SIZE, "%s", is_tv ?
+			drm_get_tv_subconnector_name((int)subconnector) :
+			drm_get_dvi_i_subconnector_name((int)subconnector));
+}
+
+static ssize_t select_subconnector_show(struct device *device,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct drm_connector *connector = to_drm_connector(device);
+	struct drm_device *dev = connector->dev;
+	struct drm_property *prop = NULL;
+	uint64_t subconnector;
+	int is_tv = 0;
+	int ret;
+
+	switch (connector->connector_type) {
+		case DRM_MODE_CONNECTOR_DVII:
+			prop = dev->mode_config.dvi_i_select_subconnector_property;
+			break;
+		case DRM_MODE_CONNECTOR_Composite:
+		case DRM_MODE_CONNECTOR_SVIDEO:
+		case DRM_MODE_CONNECTOR_Component:
+			prop = dev->mode_config.tv_select_subconnector_property;
+			is_tv = 1;
+			break;
+		default:
+			DRM_ERROR("Wrong connector type for this property\n");
+			return 0;
+	}
+
+	if (!prop) {
+		DRM_ERROR("Unable to find select subconnector property\n");
+		return 0;
+	}
+
+	ret = drm_connector_property_get_value(connector, prop, &subconnector);
+	if (ret)
+		return 0;
+
+	return snprintf(buf, PAGE_SIZE, "%s", is_tv ?
+			drm_get_tv_select_name((int)subconnector) :
+			drm_get_dvi_i_select_name((int)subconnector));
+}
+
+static struct device_attribute connector_attrs[] = {
+	__ATTR_RO(status),
+	__ATTR_RO(enabled),
+	__ATTR_RO(dpms),
+	__ATTR_RO(modes),
+};
+
+/* These attributes are for both DVI-I connectors and all types of tv-out. */
+static struct device_attribute connector_attrs_opt1[] = {
+	__ATTR_RO(subconnector),
+	__ATTR_RO(select_subconnector),
+};
+
+static struct bin_attribute edid_attr = {
+	.attr.name = "edid",
+	.size = 128,
+	.read = edid_show,
+};
+
+/**
+ * drm_sysfs_connector_add - add an connector to sysfs
+ * @connector: connector to add
+ *
+ * Create an connector device in sysfs, along with its associated connector
+ * properties (so far, connection status, dpms, mode list & edid) and
+ * generate a hotplug event so userspace knows there's a new connector
+ * available.
+ *
+ * Note:
+ * This routine should only be called *once* for each DRM minor registered.
+ * A second call for an already registered device will trigger the BUG_ON
+ * below.
+ */
+int drm_sysfs_connector_add(struct drm_connector *connector)
+{
+	struct drm_device *dev = connector->dev;
+	int ret = 0, i, j;
+
+	/* We shouldn't get called more than once for the same connector */
+	BUG_ON(device_is_registered(&connector->kdev));
+
+	connector->kdev.parent = &dev->primary->kdev;
+	connector->kdev.class = drm_class;
+	connector->kdev.release = drm_sysfs_device_release;
+
+	DRM_DEBUG("adding \"%s\" to sysfs\n",
+		  drm_get_connector_name(connector));
+
+	snprintf(connector->kdev.bus_id, BUS_ID_SIZE, "card%d-%s",
+		 dev->primary->index, drm_get_connector_name(connector));
+	ret = device_register(&connector->kdev);
+
+	if (ret) {
+		DRM_ERROR("failed to register connector device: %d\n", ret);
+		goto out;
+	}
+
+	/* Standard attributes */
+
+	for (i = 0; i < ARRAY_SIZE(connector_attrs); i++) {
+		ret = device_create_file(&connector->kdev, &connector_attrs[i]);
+		if (ret)
+			goto err_out_files;
+	}
+
+	/* Optional attributes */
+	/*
+	 * In the long run it maybe a good idea to make one set of
+	 * optionals per connector type.
+	 */
+	switch (connector->connector_type) {
+		case DRM_MODE_CONNECTOR_DVII:
+		case DRM_MODE_CONNECTOR_Composite:
+		case DRM_MODE_CONNECTOR_SVIDEO:
+		case DRM_MODE_CONNECTOR_Component:
+			for (i = 0; i < ARRAY_SIZE(connector_attrs_opt1); i++) {
+				ret = device_create_file(&connector->kdev, &connector_attrs_opt1[i]);
+				if (ret)
+					goto err_out_files;
+			}
+			break;
+		default:
+			break;
+	}
+
+	ret = sysfs_create_bin_file(&connector->kdev.kobj, &edid_attr);
+	if (ret)
+		goto err_out_files;
+
+	/* Let userspace know we have a new connector */
+	drm_sysfs_hotplug_event(dev);
+
+	return 0;
+
+err_out_files:
+	if (i > 0)
+		for (j = 0; j < i; j++)
+			device_remove_file(&connector->kdev,
+					   &connector_attrs[i]);
+	device_unregister(&connector->kdev);
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(drm_sysfs_connector_add);
+
+/**
+ * drm_sysfs_connector_remove - remove an connector device from sysfs
+ * @connector: connector to remove
+ *
+ * Remove @connector and its associated attributes from sysfs.  Note that
+ * the device model core will take care of sending the "remove" uevent
+ * at this time, so we don't need to do it.
+ *
+ * Note:
+ * This routine should only be called if the connector was previously
+ * successfully registered.  If @connector hasn't been registered yet,
+ * you'll likely see a panic somewhere deep in sysfs code when called.
+ */
+void drm_sysfs_connector_remove(struct drm_connector *connector)
+{
+	int i;
+
+	DRM_DEBUG("removing \"%s\" from sysfs\n",
+		  drm_get_connector_name(connector));
+
+	for (i = 0; i < ARRAY_SIZE(connector_attrs); i++)
+		device_remove_file(&connector->kdev, &connector_attrs[i]);
+	sysfs_remove_bin_file(&connector->kdev.kobj, &edid_attr);
+	device_unregister(&connector->kdev);
+}
+EXPORT_SYMBOL(drm_sysfs_connector_remove);
+
+/**
+ * drm_sysfs_hotplug_event - generate a DRM uevent
+ * @dev: DRM device
+ *
+ * Send a uevent for the DRM device specified by @dev.  Currently we only
+ * set HOTPLUG=1 in the uevent environment, but this could be expanded to
+ * deal with other types of events.
+ */
+void drm_sysfs_hotplug_event(struct drm_device *dev)
+{
+	char *event_string = "HOTPLUG=1";
+	char *envp[] = { event_string, NULL };
+
+	DRM_DEBUG("generating hotplug event\n");
+
+	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
+}
+
 /**
  * drm_sysfs_device_add - adds a class device to sysfs for a character driver
  * @dev: DRM device to be added
@@ -163,7 +481,12 @@ int drm_sysfs_device_add(struct drm_minor *minor)
 	minor->kdev.class = drm_class;
 	minor->kdev.release = drm_sysfs_device_release;
 	minor->kdev.devt = minor->device;
-	minor_str = "card%d";
+	if (minor->type == DRM_MINOR_CONTROL)
+		minor_str = "controlD%d";
+        else if (minor->type == DRM_MINOR_RENDER)
+                minor_str = "renderD%d";
+        else
+                minor_str = "card%d";
 
 	snprintf(minor->kdev.bus_id, BUS_ID_SIZE, minor_str, minor->index);
 
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 448d209a0bf..e6210725b9a 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -112,6 +112,23 @@ static int 		vga_video_font_height;
 static int 		vga_scan_lines		__read_mostly;
 static unsigned int 	vga_rolled_over;
 
+int vgacon_text_mode_force = 0;
+
+bool vgacon_text_force(void)
+{
+	return vgacon_text_mode_force ? true : false;
+}
+EXPORT_SYMBOL(vgacon_text_force);
+
+static int __init text_mode(char *str)
+{
+	vgacon_text_mode_force = 1;
+	return 1;
+}
+
+/* force text mode - used by kernel modesetting */
+__setup("nomodeset", text_mode);
+
 static int __init no_scroll(char *str)
 {
 	/*
diff --git a/include/drm/Kbuild b/include/drm/Kbuild
index 82b6983b7fb..b940fdfa3b2 100644
--- a/include/drm/Kbuild
+++ b/include/drm/Kbuild
@@ -1,4 +1,4 @@
-unifdef-y += drm.h drm_sarea.h
+unifdef-y += drm.h drm_sarea.h drm_mode.h
 unifdef-y += i810_drm.h
 unifdef-y += i830_drm.h
 unifdef-y += i915_drm.h
diff --git a/include/drm/drm.h b/include/drm/drm.h
index 3a66252456b..76ce6fe300b 100644
--- a/include/drm/drm.h
+++ b/include/drm/drm.h
@@ -599,6 +599,8 @@ struct drm_gem_open {
 	uint64_t size;
 };
 
+#include "drm_mode.h"
+
 #define DRM_IOCTL_BASE			'd'
 #define DRM_IO(nr)			_IO(DRM_IOCTL_BASE,nr)
 #define DRM_IOR(nr,type)		_IOR(DRM_IOCTL_BASE,nr,type)
@@ -668,6 +670,25 @@ struct drm_gem_open {
 
 #define DRM_IOCTL_UPDATE_DRAW		DRM_IOW(0x3f, struct drm_update_draw)
 
+#define DRM_IOCTL_MODE_GETRESOURCES	DRM_IOWR(0xA0, struct drm_mode_card_res)
+#define DRM_IOCTL_MODE_GETCRTC		DRM_IOWR(0xA1, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_SETCRTC		DRM_IOWR(0xA2, struct drm_mode_crtc)
+#define DRM_IOCTL_MODE_CURSOR		DRM_IOWR(0xA3, struct drm_mode_cursor)
+#define DRM_IOCTL_MODE_GETGAMMA		DRM_IOWR(0xA4, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_SETGAMMA		DRM_IOWR(0xA5, struct drm_mode_crtc_lut)
+#define DRM_IOCTL_MODE_GETENCODER	DRM_IOWR(0xA6, struct drm_mode_get_encoder)
+#define DRM_IOCTL_MODE_GETCONNECTOR	DRM_IOWR(0xA7, struct drm_mode_get_connector)
+#define DRM_IOCTL_MODE_ATTACHMODE	DRM_IOWR(0xA8, struct drm_mode_mode_cmd)
+#define DRM_IOCTL_MODE_DETACHMODE	DRM_IOWR(0xA9, struct drm_mode_mode_cmd)
+
+#define DRM_IOCTL_MODE_GETPROPERTY	DRM_IOWR(0xAA, struct drm_mode_get_property)
+#define DRM_IOCTL_MODE_SETPROPERTY	DRM_IOWR(0xAB, struct drm_mode_connector_set_property)
+#define DRM_IOCTL_MODE_GETPROPBLOB	DRM_IOWR(0xAC, struct drm_mode_get_blob)
+#define DRM_IOCTL_MODE_GETFB		DRM_IOWR(0xAD, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_ADDFB		DRM_IOWR(0xAE, struct drm_mode_fb_cmd)
+#define DRM_IOCTL_MODE_RMFB		DRM_IOWR(0xAF, unsigned int)
+#define DRM_IOCTL_MODE_REPLACEFB	DRM_IOWR(0xB0, struct drm_mode_fb_cmd)
+
 /**
  * Device specific ioctls should only be in their respective headers
  * The device specific ioctl range is from 0x40 to 0x99.
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index ae42a6a5c24..7802c80f2b2 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -105,6 +105,7 @@ struct drm_device;
 #define DRIVER_FB_DMA      0x400
 #define DRIVER_IRQ_VBL2    0x800
 #define DRIVER_GEM         0x1000
+#define DRIVER_MODESET     0x2000
 
 /***********************************************************************/
 /** \name Begin the DRM... */
@@ -276,6 +277,7 @@ typedef int drm_ioctl_compat_t(struct file *filp, unsigned int cmd,
 #define DRM_AUTH	0x1
 #define	DRM_MASTER	0x2
 #define DRM_ROOT_ONLY	0x4
+#define DRM_CONTROL_ALLOW 0x8
 
 struct drm_ioctl_desc {
 	unsigned int cmd;
@@ -398,6 +400,7 @@ struct drm_file {
 	int is_master; /* this file private is a master for a minor */
 	struct drm_master *master; /* master this node is currently associated with
 				      N.B. not always minor->master */
+	struct list_head fbs;
 };
 
 /** Wait queue */
@@ -629,6 +632,8 @@ struct drm_gem_object {
 	void *driver_private;
 };
 
+#include "drm_crtc.h"
+
 /* per-master structure */
 struct drm_master {
 
@@ -792,6 +797,8 @@ struct drm_driver {
 
 #define DRM_MINOR_UNASSIGNED 0
 #define DRM_MINOR_LEGACY 1
+#define DRM_MINOR_CONTROL 2
+#define DRM_MINOR_RENDER 3
 
 /**
  * DRM minor structure. This structure represents a drm minor number.
@@ -805,6 +812,7 @@ struct drm_minor {
 	struct proc_dir_entry *dev_root;  /**< proc directory entry */
 	struct drm_master *master; /* currently active master for this node */
 	struct list_head master_list;
+	struct drm_mode_group mode_group;
 };
 
 /**
@@ -855,6 +863,7 @@ struct drm_device {
 	struct idr ctx_idr;
 
 	struct list_head vmalist;	/**< List of vmas (for debugging) */
+
 	/*@} */
 
 	/** \name DMA queues (contexts) */
@@ -933,6 +942,7 @@ struct drm_device {
 	struct drm_driver *driver;
 	drm_local_map_t *agp_buffer_map;
 	unsigned int agp_buffer_token;
+	struct drm_minor *control;		/**< Control node for card */
 	struct drm_minor *primary;		/**< render type primary screen head */
 
 	/** \name Drawable information */
@@ -941,6 +951,8 @@ struct drm_device {
 	struct idr drw_idr;
 	/*@} */
 
+        struct drm_mode_config mode_config;	/**< Current mode config */
+
 	/** \name GEM information */
 	/*@{ */
 	spinlock_t object_name_lock;
@@ -1201,6 +1213,8 @@ extern int drm_vblank_get(struct drm_device *dev, int crtc);
 extern void drm_vblank_put(struct drm_device *dev, int crtc);
 extern void drm_vblank_cleanup(struct drm_device *dev);
 /* Modesetting support */
+extern void drm_vblank_pre_modeset(struct drm_device *dev, int crtc);
+extern void drm_vblank_post_modeset(struct drm_device *dev, int crtc);
 extern int drm_modeset_ctl(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 
@@ -1286,7 +1300,11 @@ struct drm_sysfs_class;
 extern struct class *drm_sysfs_create(struct module *owner, char *name);
 extern void drm_sysfs_destroy(void);
 extern int drm_sysfs_device_add(struct drm_minor *minor);
+extern void drm_sysfs_hotplug_event(struct drm_device *dev);
 extern void drm_sysfs_device_remove(struct drm_minor *minor);
+extern char *drm_get_connector_status_name(enum drm_connector_status status);
+extern int drm_sysfs_connector_add(struct drm_connector *connector);
+extern void drm_sysfs_connector_remove(struct drm_connector *connector);
 
 /*
  * Basic memory manager support (drm_mm.c)
diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
new file mode 100644
index 00000000000..08a884bea44
--- /dev/null
+++ b/include/drm/drm_crtc.h
@@ -0,0 +1,737 @@
+/*
+ * Copyright © 2006 Keith Packard
+ * Copyright © 2007-2008 Dave Airlie
+ * Copyright © 2007-2008 Intel Corporation
+ *   Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __DRM_CRTC_H__
+#define __DRM_CRTC_H__
+
+#include <linux/i2c.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/idr.h>
+
+#include <linux/fb.h>
+
+struct drm_device;
+struct drm_mode_set;
+struct drm_framebuffer;
+
+
+#define DRM_MODE_OBJECT_CRTC 0xcccccccc
+#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0
+#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0
+#define DRM_MODE_OBJECT_MODE 0xdededede
+#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0
+#define DRM_MODE_OBJECT_FB 0xfbfbfbfb
+#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb
+
+struct drm_mode_object {
+	uint32_t id;
+	uint32_t type;
+};
+
+/*
+ * Note on terminology:  here, for brevity and convenience, we refer to connector
+ * control chips as 'CRTCs'.  They can control any type of connector, VGA, LVDS,
+ * DVI, etc.  And 'screen' refers to the whole of the visible display, which
+ * may span multiple monitors (and therefore multiple CRTC and connector
+ * structures).
+ */
+
+enum drm_mode_status {
+    MODE_OK	= 0,	/* Mode OK */
+    MODE_HSYNC,		/* hsync out of range */
+    MODE_VSYNC,		/* vsync out of range */
+    MODE_H_ILLEGAL,	/* mode has illegal horizontal timings */
+    MODE_V_ILLEGAL,	/* mode has illegal horizontal timings */
+    MODE_BAD_WIDTH,	/* requires an unsupported linepitch */
+    MODE_NOMODE,	/* no mode with a maching name */
+    MODE_NO_INTERLACE,	/* interlaced mode not supported */
+    MODE_NO_DBLESCAN,	/* doublescan mode not supported */
+    MODE_NO_VSCAN,	/* multiscan mode not supported */
+    MODE_MEM,		/* insufficient video memory */
+    MODE_VIRTUAL_X,	/* mode width too large for specified virtual size */
+    MODE_VIRTUAL_Y,	/* mode height too large for specified virtual size */
+    MODE_MEM_VIRT,	/* insufficient video memory given virtual size */
+    MODE_NOCLOCK,	/* no fixed clock available */
+    MODE_CLOCK_HIGH,	/* clock required is too high */
+    MODE_CLOCK_LOW,	/* clock required is too low */
+    MODE_CLOCK_RANGE,	/* clock/mode isn't in a ClockRange */
+    MODE_BAD_HVALUE,	/* horizontal timing was out of range */
+    MODE_BAD_VVALUE,	/* vertical timing was out of range */
+    MODE_BAD_VSCAN,	/* VScan value out of range */
+    MODE_HSYNC_NARROW,	/* horizontal sync too narrow */
+    MODE_HSYNC_WIDE,	/* horizontal sync too wide */
+    MODE_HBLANK_NARROW,	/* horizontal blanking too narrow */
+    MODE_HBLANK_WIDE,	/* horizontal blanking too wide */
+    MODE_VSYNC_NARROW,	/* vertical sync too narrow */
+    MODE_VSYNC_WIDE,	/* vertical sync too wide */
+    MODE_VBLANK_NARROW,	/* vertical blanking too narrow */
+    MODE_VBLANK_WIDE,	/* vertical blanking too wide */
+    MODE_PANEL,         /* exceeds panel dimensions */
+    MODE_INTERLACE_WIDTH, /* width too large for interlaced mode */
+    MODE_ONE_WIDTH,     /* only one width is supported */
+    MODE_ONE_HEIGHT,    /* only one height is supported */
+    MODE_ONE_SIZE,      /* only one resolution is supported */
+    MODE_NO_REDUCED,    /* monitor doesn't accept reduced blanking */
+    MODE_UNVERIFIED = -3, /* mode needs to reverified */
+    MODE_BAD = -2,	/* unspecified reason */
+    MODE_ERROR	= -1	/* error condition */
+};
+
+#define DRM_MODE_TYPE_CLOCK_CRTC_C (DRM_MODE_TYPE_CLOCK_C | \
+				    DRM_MODE_TYPE_CRTC_C)
+
+#define DRM_MODE(nm, t, c, hd, hss, hse, ht, hsk, vd, vss, vse, vt, vs, f) \
+	.name = nm, .status = 0, .type = (t), .clock = (c), \
+	.hdisplay = (hd), .hsync_start = (hss), .hsync_end = (hse), \
+	.htotal = (ht), .hskew = (hsk), .vdisplay = (vd), \
+	.vsync_start = (vss), .vsync_end = (vse), .vtotal = (vt), \
+	.vscan = (vs), .flags = (f), .vrefresh = 0
+
+#define CRTC_INTERLACE_HALVE_V 0x1 /* halve V values for interlacing */
+
+struct drm_display_mode {
+	/* Header */
+	struct list_head head;
+	struct drm_mode_object base;
+
+	char name[DRM_DISPLAY_MODE_LEN];
+
+	int connector_count;
+	enum drm_mode_status status;
+	int type;
+
+	/* Proposed mode values */
+	int clock;
+	int hdisplay;
+	int hsync_start;
+	int hsync_end;
+	int htotal;
+	int hskew;
+	int vdisplay;
+	int vsync_start;
+	int vsync_end;
+	int vtotal;
+	int vscan;
+	unsigned int flags;
+
+	/* Addressable image size (may be 0 for projectors, etc.) */
+	int width_mm;
+	int height_mm;
+
+	/* Actual mode we give to hw */
+	int clock_index;
+	int synth_clock;
+	int crtc_hdisplay;
+	int crtc_hblank_start;
+	int crtc_hblank_end;
+	int crtc_hsync_start;
+	int crtc_hsync_end;
+	int crtc_htotal;
+	int crtc_hskew;
+	int crtc_vdisplay;
+	int crtc_vblank_start;
+	int crtc_vblank_end;
+	int crtc_vsync_start;
+	int crtc_vsync_end;
+	int crtc_vtotal;
+	int crtc_hadjusted;
+	int crtc_vadjusted;
+
+	/* Driver private mode info */
+	int private_size;
+	int *private;
+	int private_flags;
+
+	int vrefresh;
+	float hsync;
+};
+
+enum drm_connector_status {
+	connector_status_connected = 1,
+	connector_status_disconnected = 2,
+	connector_status_unknown = 3,
+};
+
+enum subpixel_order {
+	SubPixelUnknown = 0,
+	SubPixelHorizontalRGB,
+	SubPixelHorizontalBGR,
+	SubPixelVerticalRGB,
+	SubPixelVerticalBGR,
+	SubPixelNone,
+};
+
+
+/*
+ * Describes a given display (e.g. CRT or flat panel) and its limitations.
+ */
+struct drm_display_info {
+	char name[DRM_DISPLAY_INFO_LEN];
+	/* Input info */
+	bool serration_vsync;
+	bool sync_on_green;
+	bool composite_sync;
+	bool separate_syncs;
+	bool blank_to_black;
+	unsigned char video_level;
+	bool digital;
+	/* Physical size */
+        unsigned int width_mm;
+	unsigned int height_mm;
+
+	/* Display parameters */
+	unsigned char gamma; /* FIXME: storage format */
+	bool gtf_supported;
+	bool standard_color;
+	enum {
+		monochrome = 0,
+		rgb,
+		other,
+		unknown,
+	} display_type;
+	bool active_off_supported;
+	bool suspend_supported;
+	bool standby_supported;
+
+	/* Color info FIXME: storage format */
+	unsigned short redx, redy;
+	unsigned short greenx, greeny;
+	unsigned short bluex, bluey;
+	unsigned short whitex, whitey;
+
+	/* Clock limits FIXME: storage format */
+	unsigned int min_vfreq, max_vfreq;
+	unsigned int min_hfreq, max_hfreq;
+	unsigned int pixel_clock;
+
+	/* White point indices FIXME: storage format */
+	unsigned int wpx1, wpy1;
+	unsigned int wpgamma1;
+	unsigned int wpx2, wpy2;
+	unsigned int wpgamma2;
+
+	enum subpixel_order subpixel_order;
+
+	char *raw_edid; /* if any */
+};
+
+struct drm_framebuffer_funcs {
+	void (*destroy)(struct drm_framebuffer *framebuffer);
+	int (*create_handle)(struct drm_framebuffer *fb,
+			     struct drm_file *file_priv,
+			     unsigned int *handle);
+};
+
+struct drm_framebuffer {
+	struct drm_device *dev;
+	struct list_head head;
+	struct drm_mode_object base;
+	const struct drm_framebuffer_funcs *funcs;
+	unsigned int pitch;
+	unsigned int width;
+	unsigned int height;
+	/* depth can be 15 or 16 */
+	unsigned int depth;
+	int bits_per_pixel;
+	int flags;
+	void *fbdev;
+	u32 pseudo_palette[17];
+	struct list_head filp_head;
+};
+
+struct drm_property_blob {
+	struct drm_mode_object base;
+	struct list_head head;
+	unsigned int length;
+	void *data;
+};
+
+struct drm_property_enum {
+	uint64_t value;
+	struct list_head head;
+	char name[DRM_PROP_NAME_LEN];
+};
+
+struct drm_property {
+	struct list_head head;
+	struct drm_mode_object base;
+	uint32_t flags;
+	char name[DRM_PROP_NAME_LEN];
+	uint32_t num_values;
+	uint64_t *values;
+
+	struct list_head enum_blob_list;
+};
+
+struct drm_crtc;
+struct drm_connector;
+struct drm_encoder;
+
+/**
+ * drm_crtc_funcs - control CRTCs for a given device
+ * @dpms: control display power levels
+ * @save: save CRTC state
+ * @resore: restore CRTC state
+ * @lock: lock the CRTC
+ * @unlock: unlock the CRTC
+ * @shadow_allocate: allocate shadow pixmap
+ * @shadow_create: create shadow pixmap for rotation support
+ * @shadow_destroy: free shadow pixmap
+ * @mode_fixup: fixup proposed mode
+ * @mode_set: set the desired mode on the CRTC
+ * @gamma_set: specify color ramp for CRTC
+ * @destroy: deinit and free object.
+ *
+ * The drm_crtc_funcs structure is the central CRTC management structure
+ * in the DRM.  Each CRTC controls one or more connectors (note that the name
+ * CRTC is simply historical, a CRTC may control LVDS, VGA, DVI, TV out, etc.
+ * connectors, not just CRTs).
+ *
+ * Each driver is responsible for filling out this structure at startup time,
+ * in addition to providing other modesetting features, like i2c and DDC
+ * bus accessors.
+ */
+struct drm_crtc_funcs {
+	/* Save CRTC state */
+	void (*save)(struct drm_crtc *crtc); /* suspend? */
+	/* Restore CRTC state */
+	void (*restore)(struct drm_crtc *crtc); /* resume? */
+
+	/* cursor controls */
+	int (*cursor_set)(struct drm_crtc *crtc, struct drm_file *file_priv,
+			  uint32_t handle, uint32_t width, uint32_t height);
+	int (*cursor_move)(struct drm_crtc *crtc, int x, int y);
+
+	/* Set gamma on the CRTC */
+	void (*gamma_set)(struct drm_crtc *crtc, u16 *r, u16 *g, u16 *b,
+			  uint32_t size);
+	/* Object destroy routine */
+	void (*destroy)(struct drm_crtc *crtc);
+
+	int (*set_config)(struct drm_mode_set *set);
+};
+
+/**
+ * drm_crtc - central CRTC control structure
+ * @enabled: is this CRTC enabled?
+ * @x: x position on screen
+ * @y: y position on screen
+ * @desired_mode: new desired mode
+ * @desired_x: desired x for desired_mode
+ * @desired_y: desired y for desired_mode
+ * @funcs: CRTC control functions
+ *
+ * Each CRTC may have one or more connectors associated with it.  This structure
+ * allows the CRTC to be controlled.
+ */
+struct drm_crtc {
+	struct drm_device *dev;
+	struct list_head head;
+
+	struct drm_mode_object base;
+
+	/* framebuffer the connector is currently bound to */
+	struct drm_framebuffer *fb;
+
+	bool enabled;
+
+	struct drm_display_mode mode;
+
+	int x, y;
+	struct drm_display_mode *desired_mode;
+	int desired_x, desired_y;
+	const struct drm_crtc_funcs *funcs;
+
+	/* CRTC gamma size for reporting to userspace */
+	uint32_t gamma_size;
+	uint16_t *gamma_store;
+
+	/* if you are using the helper */
+	void *helper_private;
+};
+
+
+/**
+ * drm_connector_funcs - control connectors on a given device
+ * @dpms: set power state (see drm_crtc_funcs above)
+ * @save: save connector state
+ * @restore: restore connector state
+ * @mode_valid: is this mode valid on the given connector?
+ * @mode_fixup: try to fixup proposed mode for this connector
+ * @mode_set: set this mode
+ * @detect: is this connector active?
+ * @get_modes: get mode list for this connector
+ * @set_property: property for this connector may need update
+ * @destroy: make object go away
+ *
+ * Each CRTC may have one or more connectors attached to it.  The functions
+ * below allow the core DRM code to control connectors, enumerate available modes,
+ * etc.
+ */
+struct drm_connector_funcs {
+	void (*dpms)(struct drm_connector *connector, int mode);
+	void (*save)(struct drm_connector *connector);
+	void (*restore)(struct drm_connector *connector);
+	enum drm_connector_status (*detect)(struct drm_connector *connector);
+	void (*fill_modes)(struct drm_connector *connector, uint32_t max_width, uint32_t max_height);
+	int (*set_property)(struct drm_connector *connector, struct drm_property *property,
+			     uint64_t val);
+	void (*destroy)(struct drm_connector *connector);
+};
+
+struct drm_encoder_funcs {
+	void (*destroy)(struct drm_encoder *encoder);
+};
+
+#define DRM_CONNECTOR_MAX_UMODES 16
+#define DRM_CONNECTOR_MAX_PROPERTY 16
+#define DRM_CONNECTOR_LEN 32
+#define DRM_CONNECTOR_MAX_ENCODER 2
+
+/**
+ * drm_encoder - central DRM encoder structure
+ */
+struct drm_encoder {
+	struct drm_device *dev;
+	struct list_head head;
+
+	struct drm_mode_object base;
+	int encoder_type;
+	uint32_t possible_crtcs;
+	uint32_t possible_clones;
+
+	struct drm_crtc *crtc;
+	const struct drm_encoder_funcs *funcs;
+	void *helper_private;
+};
+
+/**
+ * drm_connector - central DRM connector control structure
+ * @crtc: CRTC this connector is currently connected to, NULL if none
+ * @interlace_allowed: can this connector handle interlaced modes?
+ * @doublescan_allowed: can this connector handle doublescan?
+ * @available_modes: modes available on this connector (from get_modes() + user)
+ * @initial_x: initial x position for this connector
+ * @initial_y: initial y position for this connector
+ * @status: connector connected?
+ * @funcs: connector control functions
+ *
+ * Each connector may be connected to one or more CRTCs, or may be clonable by
+ * another connector if they can share a CRTC.  Each connector also has a specific
+ * position in the broader display (referred to as a 'screen' though it could
+ * span multiple monitors).
+ */
+struct drm_connector {
+	struct drm_device *dev;
+	struct device kdev;
+	struct device_attribute *attr;
+	struct list_head head;
+
+	struct drm_mode_object base;
+
+	int connector_type;
+	int connector_type_id;
+	bool interlace_allowed;
+	bool doublescan_allowed;
+	struct list_head modes; /* list of modes on this connector */
+
+	int initial_x, initial_y;
+	enum drm_connector_status status;
+
+	/* these are modes added by probing with DDC or the BIOS */
+	struct list_head probed_modes;
+
+	struct drm_display_info display_info;
+	const struct drm_connector_funcs *funcs;
+
+	struct list_head user_modes;
+	struct drm_property_blob *edid_blob_ptr;
+	u32 property_ids[DRM_CONNECTOR_MAX_PROPERTY];
+	uint64_t property_values[DRM_CONNECTOR_MAX_PROPERTY];
+
+	void *helper_private;
+
+	uint32_t encoder_ids[DRM_CONNECTOR_MAX_ENCODER];
+	uint32_t force_encoder_id;
+	struct drm_encoder *encoder; /* currently active encoder */
+};
+
+/**
+ * struct drm_mode_set
+ *
+ * Represents a single crtc the connectors that it drives with what mode
+ * and from which framebuffer it scans out from.
+ *
+ * This is used to set modes.
+ */
+struct drm_mode_set {
+	struct list_head head;
+
+	struct drm_framebuffer *fb;
+	struct drm_crtc *crtc;
+	struct drm_display_mode *mode;
+
+	uint32_t x;
+	uint32_t y;
+
+	struct drm_connector **connectors;
+	size_t num_connectors;
+};
+
+/**
+ * struct drm_mode_config_funcs - configure CRTCs for a given screen layout
+ * @resize: adjust CRTCs as necessary for the proposed layout
+ *
+ * Currently only a resize hook is available.  DRM will call back into the
+ * driver with a new screen width and height.  If the driver can't support
+ * the proposed size, it can return false.  Otherwise it should adjust
+ * the CRTC<->connector mappings as needed and update its view of the screen.
+ */
+struct drm_mode_config_funcs {
+	int (*resize_fb)(struct drm_device *dev, struct drm_file *file_priv, struct drm_framebuffer *fb, struct drm_mode_fb_cmd *mode_cmd);
+	struct drm_framebuffer *(*fb_create)(struct drm_device *dev, struct drm_file *file_priv, struct drm_mode_fb_cmd *mode_cmd);
+	int (*fb_changed)(struct drm_device *dev);
+};
+
+struct drm_mode_group {
+	uint32_t num_crtcs;
+	uint32_t num_encoders;
+	uint32_t num_connectors;
+
+	/* list of object IDs for this group */
+	uint32_t *id_list;
+};
+
+/**
+ * drm_mode_config - Mode configuration control structure
+ *
+ */
+struct drm_mode_config {
+	struct mutex mutex; /* protects configuration and IDR */
+	struct idr crtc_idr; /* use this idr for all IDs, fb, crtc, connector, modes - just makes life easier */
+	/* this is limited to one for now */
+	int num_fb;
+	struct list_head fb_list;
+	int num_connector;
+	struct list_head connector_list;
+	int num_encoder;
+	struct list_head encoder_list;
+
+	int num_crtc;
+	struct list_head crtc_list;
+
+	struct list_head property_list;
+
+	/* in-kernel framebuffers - hung of filp_head in drm_framebuffer */
+	struct list_head fb_kernel_list;
+
+	int min_width, min_height;
+	int max_width, max_height;
+	struct drm_mode_config_funcs *funcs;
+	unsigned long fb_base;
+
+	/* pointers to standard properties */
+	struct list_head property_blob_list;
+	struct drm_property *edid_property;
+	struct drm_property *dpms_property;
+
+	/* DVI-I properties */
+	struct drm_property *dvi_i_subconnector_property;
+	struct drm_property *dvi_i_select_subconnector_property;
+
+	/* TV properties */
+	struct drm_property *tv_subconnector_property;
+	struct drm_property *tv_select_subconnector_property;
+	struct drm_property *tv_mode_property;
+	struct drm_property *tv_left_margin_property;
+	struct drm_property *tv_right_margin_property;
+	struct drm_property *tv_top_margin_property;
+	struct drm_property *tv_bottom_margin_property;
+
+	/* Optional properties */
+	struct drm_property *scaling_mode_property;
+	struct drm_property *dithering_mode_property;
+
+	/* hotplug */
+	uint32_t hotplug_counter;
+};
+
+#define obj_to_crtc(x) container_of(x, struct drm_crtc, base)
+#define obj_to_connector(x) container_of(x, struct drm_connector, base)
+#define obj_to_encoder(x) container_of(x, struct drm_encoder, base)
+#define obj_to_mode(x) container_of(x, struct drm_display_mode, base)
+#define obj_to_fb(x) container_of(x, struct drm_framebuffer, base)
+#define obj_to_property(x) container_of(x, struct drm_property, base)
+#define obj_to_blob(x) container_of(x, struct drm_property_blob, base)
+
+
+extern void drm_crtc_init(struct drm_device *dev,
+			  struct drm_crtc *crtc,
+			  const struct drm_crtc_funcs *funcs);
+extern void drm_crtc_cleanup(struct drm_crtc *crtc);
+
+extern void drm_connector_init(struct drm_device *dev,
+			    struct drm_connector *connector,
+			    const struct drm_connector_funcs *funcs,
+			    int connector_type);
+
+extern void drm_connector_cleanup(struct drm_connector *connector);
+
+extern void drm_encoder_init(struct drm_device *dev,
+			     struct drm_encoder *encoder,
+			     const struct drm_encoder_funcs *funcs,
+			     int encoder_type);
+
+extern void drm_encoder_cleanup(struct drm_encoder *encoder);
+
+extern char *drm_get_connector_name(struct drm_connector *connector);
+extern char *drm_get_dpms_name(int val);
+extern char *drm_get_dvi_i_subconnector_name(int val);
+extern char *drm_get_dvi_i_select_name(int val);
+extern char *drm_get_tv_subconnector_name(int val);
+extern char *drm_get_tv_select_name(int val);
+extern void drm_fb_release(struct file *filp);
+extern int drm_mode_group_init_legacy_group(struct drm_device *dev, struct drm_mode_group *group);
+extern struct edid *drm_get_edid(struct drm_connector *connector,
+				 struct i2c_adapter *adapter);
+extern unsigned char *drm_do_probe_ddc_edid(struct i2c_adapter *adapter);
+extern int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid);
+extern void drm_mode_probed_add(struct drm_connector *connector, struct drm_display_mode *mode);
+extern void drm_mode_remove(struct drm_connector *connector, struct drm_display_mode *mode);
+extern struct drm_display_mode *drm_mode_duplicate(struct drm_device *dev,
+						   struct drm_display_mode *mode);
+extern void drm_mode_debug_printmodeline(struct drm_display_mode *mode);
+extern void drm_mode_config_init(struct drm_device *dev);
+extern void drm_mode_config_cleanup(struct drm_device *dev);
+extern void drm_mode_set_name(struct drm_display_mode *mode);
+extern bool drm_mode_equal(struct drm_display_mode *mode1, struct drm_display_mode *mode2);
+extern int drm_mode_width(struct drm_display_mode *mode);
+extern int drm_mode_height(struct drm_display_mode *mode);
+
+/* for us by fb module */
+extern int drm_mode_attachmode_crtc(struct drm_device *dev,
+				    struct drm_crtc *crtc,
+				    struct drm_display_mode *mode);
+extern int drm_mode_detachmode_crtc(struct drm_device *dev, struct drm_display_mode *mode);
+
+extern struct drm_display_mode *drm_mode_create(struct drm_device *dev);
+extern void drm_mode_destroy(struct drm_device *dev, struct drm_display_mode *mode);
+extern void drm_mode_list_concat(struct list_head *head,
+				 struct list_head *new);
+extern void drm_mode_validate_size(struct drm_device *dev,
+				   struct list_head *mode_list,
+				   int maxX, int maxY, int maxPitch);
+extern void drm_mode_prune_invalid(struct drm_device *dev,
+				   struct list_head *mode_list, bool verbose);
+extern void drm_mode_sort(struct list_head *mode_list);
+extern int drm_mode_vrefresh(struct drm_display_mode *mode);
+extern void drm_mode_set_crtcinfo(struct drm_display_mode *p,
+				  int adjust_flags);
+extern void drm_mode_connector_list_update(struct drm_connector *connector);
+extern int drm_mode_connector_update_edid_property(struct drm_connector *connector,
+						struct edid *edid);
+extern int drm_connector_property_set_value(struct drm_connector *connector,
+					 struct drm_property *property,
+					 uint64_t value);
+extern int drm_connector_property_get_value(struct drm_connector *connector,
+					 struct drm_property *property,
+					 uint64_t *value);
+extern struct drm_display_mode *drm_crtc_mode_create(struct drm_device *dev);
+extern void drm_framebuffer_set_object(struct drm_device *dev,
+				       unsigned long handle);
+extern int drm_framebuffer_init(struct drm_device *dev,
+				struct drm_framebuffer *fb,
+				const struct drm_framebuffer_funcs *funcs);
+extern void drm_framebuffer_cleanup(struct drm_framebuffer *fb);
+extern int drmfb_probe(struct drm_device *dev, struct drm_crtc *crtc);
+extern int drmfb_remove(struct drm_device *dev, struct drm_framebuffer *fb);
+extern void drm_crtc_probe_connector_modes(struct drm_device *dev, int maxX, int maxY);
+extern bool drm_crtc_in_use(struct drm_crtc *crtc);
+
+extern int drm_connector_attach_property(struct drm_connector *connector,
+				      struct drm_property *property, uint64_t init_val);
+extern struct drm_property *drm_property_create(struct drm_device *dev, int flags,
+						const char *name, int num_values);
+extern void drm_property_destroy(struct drm_device *dev, struct drm_property *property);
+extern int drm_property_add_enum(struct drm_property *property, int index,
+				 uint64_t value, const char *name);
+extern int drm_mode_create_dvi_i_properties(struct drm_device *dev);
+extern int drm_mode_create_tv_properties(struct drm_device *dev, int num_formats,
+				     char *formats[]);
+extern int drm_mode_create_scaling_mode_property(struct drm_device *dev);
+extern int drm_mode_create_dithering_property(struct drm_device *dev);
+extern char *drm_get_encoder_name(struct drm_encoder *encoder);
+
+extern int drm_mode_connector_attach_encoder(struct drm_connector *connector,
+					     struct drm_encoder *encoder);
+extern void drm_mode_connector_detach_encoder(struct drm_connector *connector,
+					   struct drm_encoder *encoder);
+extern bool drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc,
+					 int gamma_size);
+extern void *drm_mode_object_find(struct drm_device *dev, uint32_t id, uint32_t type);
+/* IOCTLs */
+extern int drm_mode_getresources(struct drm_device *dev,
+				 void *data, struct drm_file *file_priv);
+
+extern int drm_mode_getcrtc(struct drm_device *dev,
+			    void *data, struct drm_file *file_priv);
+extern int drm_mode_getconnector(struct drm_device *dev,
+			      void *data, struct drm_file *file_priv);
+extern int drm_mode_setcrtc(struct drm_device *dev,
+			    void *data, struct drm_file *file_priv);
+extern int drm_mode_cursor_ioctl(struct drm_device *dev,
+				void *data, struct drm_file *file_priv);
+extern int drm_mode_addfb(struct drm_device *dev,
+			  void *data, struct drm_file *file_priv);
+extern int drm_mode_rmfb(struct drm_device *dev,
+			 void *data, struct drm_file *file_priv);
+extern int drm_mode_getfb(struct drm_device *dev,
+			  void *data, struct drm_file *file_priv);
+extern int drm_mode_addmode_ioctl(struct drm_device *dev,
+				  void *data, struct drm_file *file_priv);
+extern int drm_mode_rmmode_ioctl(struct drm_device *dev,
+				 void *data, struct drm_file *file_priv);
+extern int drm_mode_attachmode_ioctl(struct drm_device *dev,
+				     void *data, struct drm_file *file_priv);
+extern int drm_mode_detachmode_ioctl(struct drm_device *dev,
+				     void *data, struct drm_file *file_priv);
+
+extern int drm_mode_getproperty_ioctl(struct drm_device *dev,
+				      void *data, struct drm_file *file_priv);
+extern int drm_mode_getblob_ioctl(struct drm_device *dev,
+				  void *data, struct drm_file *file_priv);
+extern int drm_mode_connector_property_set_ioctl(struct drm_device *dev,
+					      void *data, struct drm_file *file_priv);
+extern int drm_mode_hotplug_ioctl(struct drm_device *dev,
+				  void *data, struct drm_file *file_priv);
+extern int drm_mode_replacefb(struct drm_device *dev,
+			      void *data, struct drm_file *file_priv);
+extern int drm_mode_getencoder(struct drm_device *dev,
+			       void *data, struct drm_file *file_priv);
+extern int drm_mode_gamma_get_ioctl(struct drm_device *dev,
+				    void *data, struct drm_file *file_priv);
+extern int drm_mode_gamma_set_ioctl(struct drm_device *dev,
+				    void *data, struct drm_file *file_priv);
+#endif /* __DRM_CRTC_H__ */
diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h
new file mode 100644
index 00000000000..a341828d1d1
--- /dev/null
+++ b/include/drm/drm_crtc_helper.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright © 2006 Keith Packard
+ * Copyright © 2007-2008 Dave Airlie
+ * Copyright © 2007-2008 Intel Corporation
+ *   Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * The DRM mode setting helper functions are common code for drivers to use if
+ * they wish.  Drivers are not forced to use this code in their
+ * implementations but it would be useful if they code they do use at least
+ * provides a consistent interface and operation to userspace
+ */
+
+#ifndef __DRM_CRTC_HELPER_H__
+#define __DRM_CRTC_HELPER_H__
+
+#include <linux/i2c.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/idr.h>
+
+#include <linux/fb.h>
+
+struct drm_crtc_helper_funcs {
+	/*
+	 * Control power levels on the CRTC.  If the mode passed in is
+	 * unsupported, the provider must use the next lowest power level.
+	 */
+	void (*dpms)(struct drm_crtc *crtc, int mode);
+	void (*prepare)(struct drm_crtc *crtc);
+	void (*commit)(struct drm_crtc *crtc);
+
+	/* Provider can fixup or change mode timings before modeset occurs */
+	bool (*mode_fixup)(struct drm_crtc *crtc,
+			   struct drm_display_mode *mode,
+			   struct drm_display_mode *adjusted_mode);
+	/* Actually set the mode */
+	void (*mode_set)(struct drm_crtc *crtc, struct drm_display_mode *mode,
+			 struct drm_display_mode *adjusted_mode, int x, int y);
+
+	/* Move the crtc on the current fb to the given position *optional* */
+	void (*mode_set_base)(struct drm_crtc *crtc, int x, int y);
+};
+
+struct drm_encoder_helper_funcs {
+	void (*dpms)(struct drm_encoder *encoder, int mode);
+	void (*save)(struct drm_encoder *encoder);
+	void (*restore)(struct drm_encoder *encoder);
+
+	bool (*mode_fixup)(struct drm_encoder *encoder,
+			   struct drm_display_mode *mode,
+			   struct drm_display_mode *adjusted_mode);
+	void (*prepare)(struct drm_encoder *encoder);
+	void (*commit)(struct drm_encoder *encoder);
+	void (*mode_set)(struct drm_encoder *encoder,
+			 struct drm_display_mode *mode,
+			 struct drm_display_mode *adjusted_mode);
+	/* detect for DAC style encoders */
+	enum drm_connector_status (*detect)(struct drm_encoder *encoder,
+					    struct drm_connector *connector);
+};
+
+struct drm_connector_helper_funcs {
+	int (*get_modes)(struct drm_connector *connector);
+	int (*mode_valid)(struct drm_connector *connector,
+			  struct drm_display_mode *mode);
+	struct drm_encoder *(*best_encoder)(struct drm_connector *connector);
+};
+
+extern void drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY);
+extern void drm_helper_disable_unused_functions(struct drm_device *dev);
+extern int drm_helper_hotplug_stage_two(struct drm_device *dev);
+extern bool drm_helper_initial_config(struct drm_device *dev, bool can_grow);
+extern int drm_crtc_helper_set_config(struct drm_mode_set *set);
+extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc,
+				     struct drm_display_mode *mode,
+				     int x, int y);
+extern bool drm_helper_crtc_in_use(struct drm_crtc *crtc);
+
+extern int drm_helper_mode_fill_fb_struct(struct drm_framebuffer *fb,
+					  struct drm_mode_fb_cmd *mode_cmd);
+
+static inline void drm_crtc_helper_add(struct drm_crtc *crtc,
+				       const struct drm_crtc_helper_funcs *funcs)
+{
+	crtc->helper_private = (void *)funcs;
+}
+
+static inline void drm_encoder_helper_add(struct drm_encoder *encoder,
+					  const struct drm_encoder_helper_funcs *funcs)
+{
+	encoder->helper_private = (void *)funcs;
+}
+
+static inline void drm_connector_helper_add(struct drm_connector *connector,
+					    const struct drm_connector_helper_funcs *funcs)
+{
+	connector->helper_private = (void *)funcs;
+}
+
+extern int drm_helper_resume_force_mode(struct drm_device *dev);
+#endif
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
new file mode 100644
index 00000000000..c707c15f516
--- /dev/null
+++ b/include/drm/drm_edid.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright © 2007-2008 Intel Corporation
+ *   Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __DRM_EDID_H__
+#define __DRM_EDID_H__
+
+#include <linux/types.h>
+
+#define EDID_LENGTH 128
+#define DDC_ADDR 0x50
+
+#ifdef BIG_ENDIAN
+#error "EDID structure is little endian, need big endian versions"
+#else
+
+struct est_timings {
+	u8 t1;
+	u8 t2;
+	u8 mfg_rsvd;
+} __attribute__((packed));
+
+struct std_timing {
+	u8 hsize; /* need to multiply by 8 then add 248 */
+	u8 vfreq:6; /* need to add 60 */
+	u8 aspect_ratio:2; /* 00=16:10, 01=4:3, 10=5:4, 11=16:9 */
+} __attribute__((packed));
+
+/* If detailed data is pixel timing */
+struct detailed_pixel_timing {
+	u8 hactive_lo;
+	u8 hblank_lo;
+	u8 hblank_hi:4;
+	u8 hactive_hi:4;
+	u8 vactive_lo;
+	u8 vblank_lo;
+	u8 vblank_hi:4;
+	u8 vactive_hi:4;
+	u8 hsync_offset_lo;
+	u8 hsync_pulse_width_lo;
+	u8 vsync_pulse_width_lo:4;
+	u8 vsync_offset_lo:4;
+	u8 hsync_pulse_width_hi:2;
+	u8 hsync_offset_hi:2;
+	u8 vsync_pulse_width_hi:2;
+	u8 vsync_offset_hi:2;
+	u8 width_mm_lo;
+	u8 height_mm_lo;
+	u8 height_mm_hi:4;
+	u8 width_mm_hi:4;
+	u8 hborder;
+	u8 vborder;
+	u8 unknown0:1;
+	u8 vsync_positive:1;
+	u8 hsync_positive:1;
+	u8 separate_sync:2;
+	u8 stereo:1;
+	u8 unknown6:1;
+	u8 interlaced:1;
+} __attribute__((packed));
+
+/* If it's not pixel timing, it'll be one of the below */
+struct detailed_data_string {
+	u8 str[13];
+} __attribute__((packed));
+
+struct detailed_data_monitor_range {
+	u8 min_vfreq;
+	u8 max_vfreq;
+	u8 min_hfreq_khz;
+	u8 max_hfreq_khz;
+	u8 pixel_clock_mhz; /* need to multiply by 10 */
+	u16 sec_gtf_toggle; /* A000=use above, 20=use below */ /* FIXME: byte order */
+	u8 hfreq_start_khz; /* need to multiply by 2 */
+	u8 c; /* need to divide by 2 */
+	u16 m; /* FIXME: byte order */
+	u8 k;
+	u8 j; /* need to divide by 2 */
+} __attribute__((packed));
+
+struct detailed_data_wpindex {
+	u8 white_y_lo:2;
+	u8 white_x_lo:2;
+	u8 pad:4;
+	u8 white_x_hi;
+	u8 white_y_hi;
+	u8 gamma; /* need to divide by 100 then add 1 */
+} __attribute__((packed));
+
+struct detailed_data_color_point {
+	u8 windex1;
+	u8 wpindex1[3];
+	u8 windex2;
+	u8 wpindex2[3];
+} __attribute__((packed));
+
+struct detailed_non_pixel {
+	u8 pad1;
+	u8 type; /* ff=serial, fe=string, fd=monitor range, fc=monitor name
+		    fb=color point data, fa=standard timing data,
+		    f9=undefined, f8=mfg. reserved */
+	u8 pad2;
+	union {
+		struct detailed_data_string str;
+		struct detailed_data_monitor_range range;
+		struct detailed_data_wpindex color;
+		struct std_timing timings[5];
+	} data;
+} __attribute__((packed));
+
+#define EDID_DETAIL_STD_MODES 0xfa
+#define EDID_DETAIL_MONITOR_CPDATA 0xfb
+#define EDID_DETAIL_MONITOR_NAME 0xfc
+#define EDID_DETAIL_MONITOR_RANGE 0xfd
+#define EDID_DETAIL_MONITOR_STRING 0xfe
+#define EDID_DETAIL_MONITOR_SERIAL 0xff
+
+struct detailed_timing {
+	u16 pixel_clock; /* need to multiply by 10 KHz */ /* FIXME: byte order */
+	union {
+		struct detailed_pixel_timing pixel_data;
+		struct detailed_non_pixel other_data;
+	} data;
+} __attribute__((packed));
+
+struct edid {
+	u8 header[8];
+	/* Vendor & product info */
+	u8 mfg_id[2];
+	u8 prod_code[2];
+	u32 serial; /* FIXME: byte order */
+	u8 mfg_week;
+	u8 mfg_year;
+	/* EDID version */
+	u8 version;
+	u8 revision;
+	/* Display info: */
+	/*   input definition */
+	u8 serration_vsync:1;
+	u8 sync_on_green:1;
+	u8 composite_sync:1;
+	u8 separate_syncs:1;
+	u8 blank_to_black:1;
+	u8 video_level:2;
+	u8 digital:1; /* bits below must be zero if set */
+	u8 width_cm;
+	u8 height_cm;
+	u8 gamma;
+	/*   feature support */
+	u8 default_gtf:1;
+	u8 preferred_timing:1;
+	u8 standard_color:1;
+	u8 display_type:2; /* 00=mono, 01=rgb, 10=non-rgb, 11=unknown */
+	u8 pm_active_off:1;
+	u8 pm_suspend:1;
+	u8 pm_standby:1;
+	/* Color characteristics */
+	u8 red_green_lo;
+	u8 black_white_lo;
+	u8 red_x;
+	u8 red_y;
+	u8 green_x;
+	u8 green_y;
+	u8 blue_x;
+	u8 blue_y;
+	u8 white_x;
+	u8 white_y;
+	/* Est. timings and mfg rsvd timings*/
+	struct est_timings established_timings;
+	/* Standard timings 1-8*/
+	struct std_timing standard_timings[8];
+	/* Detailing timings 1-4 */
+	struct detailed_timing detailed_timings[4];
+	/* Number of 128 byte ext. blocks */
+	u8 extensions;
+	/* Checksum */
+	u8 checksum;
+} __attribute__((packed));
+
+#endif /* little endian structs */
+
+#define EDID_PRODUCT_ID(e) ((e)->prod_code[0] | ((e)->prod_code[1] << 8))
+
+#endif /* __DRM_EDID_H__ */
diff --git a/include/drm/drm_mode.h b/include/drm/drm_mode.h
new file mode 100644
index 00000000000..d2e791920ab
--- /dev/null
+++ b/include/drm/drm_mode.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2007 Dave Airlie <airlied@linux.ie>
+ * Copyright (c) 2007 Jakob Bornecrantz <wallbraker@gmail.com>
+ * Copyright (c) 2008 Red Hat Inc.
+ * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA
+ * Copyright (c) 2007-2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _DRM_MODE_H
+#define _DRM_MODE_H
+
+#if !defined(__KERNEL__) && !defined(_KERNEL)
+#include <stdint.h>
+#else
+#include <linux/kernel.h>
+#endif
+
+#define DRM_DISPLAY_INFO_LEN 32
+#define DRM_CONNECTOR_NAME_LEN 32
+#define DRM_DISPLAY_MODE_LEN 32
+#define DRM_PROP_NAME_LEN 32
+
+#define DRM_MODE_TYPE_BUILTIN	(1<<0)
+#define DRM_MODE_TYPE_CLOCK_C	((1<<1) | DRM_MODE_TYPE_BUILTIN)
+#define DRM_MODE_TYPE_CRTC_C	((1<<2) | DRM_MODE_TYPE_BUILTIN)
+#define DRM_MODE_TYPE_PREFERRED	(1<<3)
+#define DRM_MODE_TYPE_DEFAULT	(1<<4)
+#define DRM_MODE_TYPE_USERDEF	(1<<5)
+#define DRM_MODE_TYPE_DRIVER	(1<<6)
+
+/* Video mode flags */
+/* bit compatible with the xorg definitions. */
+#define DRM_MODE_FLAG_PHSYNC	(1<<0)
+#define DRM_MODE_FLAG_NHSYNC	(1<<1)
+#define DRM_MODE_FLAG_PVSYNC	(1<<2)
+#define DRM_MODE_FLAG_NVSYNC	(1<<3)
+#define DRM_MODE_FLAG_INTERLACE	(1<<4)
+#define DRM_MODE_FLAG_DBLSCAN	(1<<5)
+#define DRM_MODE_FLAG_CSYNC	(1<<6)
+#define DRM_MODE_FLAG_PCSYNC	(1<<7)
+#define DRM_MODE_FLAG_NCSYNC	(1<<8)
+#define DRM_MODE_FLAG_HSKEW	(1<<9) /* hskew provided */
+#define DRM_MODE_FLAG_BCAST	(1<<10)
+#define DRM_MODE_FLAG_PIXMUX	(1<<11)
+#define DRM_MODE_FLAG_DBLCLK	(1<<12)
+#define DRM_MODE_FLAG_CLKDIV2	(1<<13)
+
+/* DPMS flags */
+/* bit compatible with the xorg definitions. */
+#define DRM_MODE_DPMS_ON 0
+#define DRM_MODE_DPMS_STANDBY 1
+#define DRM_MODE_DPMS_SUSPEND 2
+#define DRM_MODE_DPMS_OFF 3
+
+/* Scaling mode options */
+#define DRM_MODE_SCALE_NON_GPU 0
+#define DRM_MODE_SCALE_FULLSCREEN 1
+#define DRM_MODE_SCALE_NO_SCALE 2
+#define DRM_MODE_SCALE_ASPECT 3
+
+/* Dithering mode options */
+#define DRM_MODE_DITHERING_OFF 0
+#define DRM_MODE_DITHERING_ON 1
+
+struct drm_mode_modeinfo {
+	unsigned int clock;
+	unsigned short hdisplay, hsync_start, hsync_end, htotal, hskew;
+	unsigned short vdisplay, vsync_start, vsync_end, vtotal, vscan;
+
+	unsigned int vrefresh; /* vertical refresh * 1000 */
+
+	unsigned int flags;
+	unsigned int type;
+	char name[DRM_DISPLAY_MODE_LEN];
+};
+
+struct drm_mode_card_res {
+	uint64_t fb_id_ptr;
+	uint64_t crtc_id_ptr;
+	uint64_t connector_id_ptr;
+	uint64_t encoder_id_ptr;
+	int count_fbs;
+	int count_crtcs;
+	int count_connectors;
+	int count_encoders;
+	int min_width, max_width;
+	int min_height, max_height;
+};
+
+struct drm_mode_crtc {
+	uint64_t set_connectors_ptr;
+	int count_connectors;
+
+	unsigned int crtc_id; /**< Id */
+	unsigned int fb_id; /**< Id of framebuffer */
+
+	int x, y; /**< Position on the frameuffer */
+
+	uint32_t gamma_size;
+	int mode_valid;
+	struct drm_mode_modeinfo mode;
+};
+
+#define DRM_MODE_ENCODER_NONE 0
+#define DRM_MODE_ENCODER_DAC  1
+#define DRM_MODE_ENCODER_TMDS 2
+#define DRM_MODE_ENCODER_LVDS 3
+#define DRM_MODE_ENCODER_TVDAC 4
+
+struct drm_mode_get_encoder {
+	unsigned int encoder_id;
+	unsigned int encoder_type;
+
+	unsigned int crtc_id; /**< Id of crtc */
+
+	uint32_t possible_crtcs;
+	uint32_t possible_clones;
+};
+
+/* This is for connectors with multiple signal types. */
+/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. */
+#define DRM_MODE_SUBCONNECTOR_Automatic 0
+#define DRM_MODE_SUBCONNECTOR_Unknown 0
+#define DRM_MODE_SUBCONNECTOR_DVID 3
+#define DRM_MODE_SUBCONNECTOR_DVIA 4
+#define DRM_MODE_SUBCONNECTOR_Composite 5
+#define DRM_MODE_SUBCONNECTOR_SVIDEO 6
+#define DRM_MODE_SUBCONNECTOR_Component 8
+
+#define DRM_MODE_CONNECTOR_Unknown 0
+#define DRM_MODE_CONNECTOR_VGA 1
+#define DRM_MODE_CONNECTOR_DVII 2
+#define DRM_MODE_CONNECTOR_DVID 3
+#define DRM_MODE_CONNECTOR_DVIA 4
+#define DRM_MODE_CONNECTOR_Composite 5
+#define DRM_MODE_CONNECTOR_SVIDEO 6
+#define DRM_MODE_CONNECTOR_LVDS 7
+#define DRM_MODE_CONNECTOR_Component 8
+#define DRM_MODE_CONNECTOR_9PinDIN 9
+#define DRM_MODE_CONNECTOR_DisplayPort 10
+#define DRM_MODE_CONNECTOR_HDMIA 11
+#define DRM_MODE_CONNECTOR_HDMIB 12
+
+struct drm_mode_get_connector {
+
+	uint64_t encoders_ptr;
+	uint64_t modes_ptr;
+	uint64_t props_ptr;
+	uint64_t prop_values_ptr;
+
+	int count_modes;
+	int count_props;
+	int count_encoders;
+
+	unsigned int encoder_id; /**< Current Encoder */
+	unsigned int connector_id; /**< Id */
+	unsigned int connector_type;
+	unsigned int connector_type_id;
+
+	unsigned int connection;
+	unsigned int mm_width, mm_height; /**< HxW in millimeters */
+	unsigned int subpixel;
+};
+
+#define DRM_MODE_PROP_PENDING (1<<0)
+#define DRM_MODE_PROP_RANGE (1<<1)
+#define DRM_MODE_PROP_IMMUTABLE (1<<2)
+#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */
+#define DRM_MODE_PROP_BLOB (1<<4)
+
+struct drm_mode_property_enum {
+	uint64_t value;
+	unsigned char name[DRM_PROP_NAME_LEN];
+};
+
+struct drm_mode_get_property {
+	uint64_t values_ptr; /* values and blob lengths */
+	uint64_t enum_blob_ptr; /* enum and blob id ptrs */
+
+	unsigned int prop_id;
+	unsigned int flags;
+	unsigned char name[DRM_PROP_NAME_LEN];
+
+	int count_values;
+	int count_enum_blobs;
+};
+
+struct drm_mode_connector_set_property {
+	uint64_t value;
+	unsigned int prop_id;
+	unsigned int connector_id;
+};
+
+struct drm_mode_get_blob {
+	uint32_t blob_id;
+	uint32_t length;
+	uint64_t data;
+};
+
+struct drm_mode_fb_cmd {
+	unsigned int buffer_id;
+	unsigned int width, height;
+	unsigned int pitch;
+	unsigned int bpp;
+	unsigned int depth;
+
+	unsigned int handle;
+};
+
+struct drm_mode_mode_cmd {
+	unsigned int connector_id;
+	struct drm_mode_modeinfo mode;
+};
+
+#define DRM_MODE_CURSOR_BO   0x01
+#define DRM_MODE_CURSOR_MOVE 0x02
+
+/*
+ * depending on the value in flags diffrent members are used.
+ *
+ * CURSOR_BO uses
+ *    crtc
+ *    width
+ *    height
+ *    handle - if 0 turns the cursor of
+ *
+ * CURSOR_MOVE uses
+ *    crtc
+ *    x
+ *    y
+ */
+struct drm_mode_cursor {
+	unsigned int flags;
+	unsigned int crtc;
+	int x;
+	int y;
+	uint32_t width;
+	uint32_t height;
+	unsigned int handle;
+};
+
+/*
+ * oh so ugly hotplug
+ */
+struct drm_mode_hotplug {
+	uint32_t counter;
+};
+
+struct drm_mode_crtc_lut {
+
+	uint32_t crtc_id;
+	uint32_t gamma_size;
+
+	/* pointers to arrays */
+	uint64_t red;
+	uint64_t green;
+	uint64_t blue;
+};
+
+#endif
diff --git a/include/linux/console.h b/include/linux/console.h
index 248e6e3b9b7..a67a90cf826 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -153,4 +153,8 @@ void vcs_remove_sysfs(struct tty_struct *tty);
 #define VESA_HSYNC_SUSPEND      2
 #define VESA_POWERDOWN          3
 
+#ifdef CONFIG_VGA_CONSOLE
+extern bool vgacon_text_force(void);
+#endif
+
 #endif /* _LINUX_CONSOLE_H */
-- 
cgit v1.2.3-70-g09d2


From 773ff60e841461cb1f9374a713ffcda029b8c317 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 23 Dec 2008 19:37:01 +0900
Subject: SLUB: failslab support

Currently fault-injection capability for SLAB allocator is only
available to SLAB. This patch makes it available to SLUB, too.

[penberg@cs.helsinki.fi: unify slab and slub implementations]
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/fault-inject.h |  9 ++++++
 lib/Kconfig.debug            |  1 +
 mm/Makefile                  |  1 +
 mm/failslab.c                | 59 ++++++++++++++++++++++++++++++++++
 mm/slab.c                    | 75 +++-----------------------------------------
 mm/slub.c                    |  4 +++
 6 files changed, 79 insertions(+), 70 deletions(-)
 create mode 100644 mm/failslab.c

(limited to 'include/linux')

diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 32368c4f032..06ca9b21dad 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -81,4 +81,13 @@ static inline void cleanup_fault_attr_dentries(struct fault_attr *attr)
 
 #endif /* CONFIG_FAULT_INJECTION */
 
+#ifdef CONFIG_FAILSLAB
+extern bool should_failslab(size_t size, gfp_t gfpflags);
+#else
+static inline bool should_failslab(size_t size, gfp_t gfpflags)
+{
+	return false;
+}
+#endif /* CONFIG_FAILSLAB */
+
 #endif /* _LINUX_FAULT_INJECT_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443b..af65ae7f054 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -699,6 +699,7 @@ config FAULT_INJECTION
 config FAILSLAB
 	bool "Fault-injection capability for kmalloc"
 	depends on FAULT_INJECTION
+	depends on SLAB || SLUB
 	help
 	  Provide fault-injection capability for kmalloc.
 
diff --git a/mm/Makefile b/mm/Makefile
index c06b45a1ff5..51c27709cc7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
diff --git a/mm/failslab.c b/mm/failslab.c
new file mode 100644
index 00000000000..7c6ea6493f8
--- /dev/null
+++ b/mm/failslab.c
@@ -0,0 +1,59 @@
+#include <linux/fault-inject.h>
+
+static struct {
+	struct fault_attr attr;
+	u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+	struct dentry *ignore_gfp_wait_file;
+#endif
+} failslab = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+};
+
+bool should_failslab(size_t size, gfp_t gfpflags)
+{
+	if (gfpflags & __GFP_NOFAIL)
+		return false;
+
+        if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+		return false;
+
+	return should_fail(&failslab.attr, size);
+}
+
+static int __init setup_failslab(char *str)
+{
+	return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init failslab_debugfs_init(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&failslab.attr, "failslab");
+	if (err)
+		return err;
+	dir = failslab.attr.dentries.dir;
+
+	failslab.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &failslab.ignore_gfp_wait);
+
+	if (!failslab.ignore_gfp_wait_file) {
+		err = -ENOMEM;
+		debugfs_remove(failslab.ignore_gfp_wait_file);
+		cleanup_fault_attr_dentries(&failslab.attr);
+	}
+
+	return err;
+}
+
+late_initcall(failslab_debugfs_init);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/mm/slab.c b/mm/slab.c
index 09187517f9d..c347dd8480c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3106,79 +3106,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
-#ifdef CONFIG_FAILSLAB
-
-static struct failslab_attr {
-
-	struct fault_attr attr;
-
-	u32 ignore_gfp_wait;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-	struct dentry *ignore_gfp_wait_file;
-#endif
-
-} failslab = {
-	.attr = FAULT_ATTR_INITIALIZER,
-	.ignore_gfp_wait = 1,
-};
-
-static int __init setup_failslab(char *str)
-{
-	return setup_fault_attr(&failslab.attr, str);
-}
-__setup("failslab=", setup_failslab);
-
-static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
 {
 	if (cachep == &cache_cache)
-		return 0;
-	if (flags & __GFP_NOFAIL)
-		return 0;
-	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
-		return 0;
+		return false;
 
-	return should_fail(&failslab.attr, obj_size(cachep));
+	return should_failslab(obj_size(cachep), flags);
 }
 
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
-static int __init failslab_debugfs(void)
-{
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-	struct dentry *dir;
-	int err;
-
-	err = init_fault_attr_dentries(&failslab.attr, "failslab");
-	if (err)
-		return err;
-	dir = failslab.attr.dentries.dir;
-
-	failslab.ignore_gfp_wait_file =
-		debugfs_create_bool("ignore-gfp-wait", mode, dir,
-				      &failslab.ignore_gfp_wait);
-
-	if (!failslab.ignore_gfp_wait_file) {
-		err = -ENOMEM;
-		debugfs_remove(failslab.ignore_gfp_wait_file);
-		cleanup_fault_attr_dentries(&failslab.attr);
-	}
-
-	return err;
-}
-
-late_initcall(failslab_debugfs);
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
-#else /* CONFIG_FAILSLAB */
-
-static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
-	return 0;
-}
-
-#endif /* CONFIG_FAILSLAB */
-
 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *objp;
@@ -3381,7 +3316,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	unsigned long save_flags;
 	void *ptr;
 
-	if (should_failslab(cachep, flags))
+	if (slab_should_failslab(cachep, flags))
 		return NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
@@ -3457,7 +3392,7 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	unsigned long save_flags;
 	void *objp;
 
-	if (should_failslab(cachep, flags))
+	if (slab_should_failslab(cachep, flags))
 		return NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
diff --git a/mm/slub.c b/mm/slub.c
index a2cd47d89e0..640fde7e354 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -24,6 +24,7 @@
 #include <linux/kallsyms.h>
 #include <linux/memory.h>
 #include <linux/math64.h>
+#include <linux/fault-inject.h>
 
 /*
  * Lock order:
@@ -1591,6 +1592,9 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	unsigned long flags;
 	unsigned int objsize;
 
+	if (should_failslab(s->objsize, gfpflags))
+		return NULL;
+
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
 	objsize = c->objsize;
-- 
cgit v1.2.3-70-g09d2


From dfcd3610289132a762b7dc0eaf33998262cd9e20 Mon Sep 17 00:00:00 2001
From: Pascal Terjan <pterjan@mandriva.com>
Date: Tue, 25 Nov 2008 15:08:19 +0100
Subject: slab: Fix comment on #endif

This #endif in slab.h is described as closing the inner block while it's for
the big CONFIG_NUMA one. That makes reading the code a bit harder.

This trivial patch fixes the comment.

Signed-off-by: Pascal Terjan <pterjan@mandriva.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/slab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 000da12b5cf..9d8ca14be3c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -285,7 +285,7 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *);
 #define kmalloc_node_track_caller(size, flags, node) \
 	kmalloc_track_caller(size, flags)
 
-#endif /* DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
 
 /*
  * Shortcuts
-- 
cgit v1.2.3-70-g09d2


From 43a256322ac1fc105c181b3cade3b9bfc0b63ca1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Sun, 28 Dec 2008 16:01:13 -0800
Subject: sparseirq: move __weak symbols into separate compilation unit

GCC has a bug with __weak alias functions: if the functions are in
the same compilation unit as their call site, GCC can decide to
inline them - and thus rob the linker of the opportunity to override
the weak alias with the real thing.

So move all the IRQ handling related __weak symbols to kernel/irq/chip.c.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h |  6 ++++++
 include/linux/irq.h       |  3 ---
 init/main.c               | 10 ----------
 kernel/irq/manage.c       |  9 ---------
 kernel/softirq.c          | 20 ++++++++++++++++++++
 5 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 777f89e00b4..d9a370325ae 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -467,4 +467,10 @@ static inline void init_irq_proc(void)
 
 int show_interrupts(struct seq_file *p, void *v);
 
+struct irq_desc;
+
+extern int early_irq_init(void);
+extern int arch_early_irq_init(void);
+extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
+
 #endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0e40af4bac4..d64a6d49bde 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -193,9 +193,6 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-extern int early_irq_init(void);
-extern int arch_early_irq_init(void);
-extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
 extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
 					struct irq_desc *desc, int cpu);
 extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
diff --git a/init/main.c b/init/main.c
index c314aa15370..2c183abbf61 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,16 +539,6 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
-int __init __weak arch_early_irq_init(void)
-{
-	return 0;
-}
-
-int __init __weak early_irq_init(void)
-{
-	return arch_early_irq_init();
-}
-
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c2741b02ad3..46953a06f4a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -261,15 +261,6 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
 
-/*
- * [ Not in kernel/irq/handle.c, so that GCC does not
- *   inline the __weak alias: ]
- */
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
-{
-	return 0;
-}
-
 static int set_irq_wake_real(unsigned int irq, unsigned int on)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d6..daf46358d2d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -797,3 +797,23 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
 }
 EXPORT_SYMBOL(on_each_cpu);
 #endif
+
+/*
+ * [ These __weak aliases are kept in a separate compilation unit, so that
+ *   GCC does not inline them incorrectly. ]
+ */
+
+int __init __weak early_irq_init(void)
+{
+	return 0;
+}
+
+int __init __weak arch_early_irq_init(void)
+{
+	return 0;
+}
+
+int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+	return 0;
+}
-- 
cgit v1.2.3-70-g09d2


From d61c72e52b98411d1cfef1fdb3f5a8bb070f8966 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 10 Dec 2008 14:07:21 +0100
Subject: DMI: add dmi_match

Add a wrapper for testing system_info which will handle also NULL
system infos.

This will be used by the ata PIIX driver.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Alexandru Romanescu <a_romanescu@yahoo.co.uk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/firmware/dmi_scan.c | 16 ++++++++++++++++
 include/linux/dmi.h         |  3 +++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 4a597d8c2f7..78b989d202a 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -582,3 +582,19 @@ int dmi_walk(void (*decode)(const struct dmi_header *))
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dmi_walk);
+
+/**
+ * dmi_match - compare a string to the dmi field (if exists)
+ *
+ * Returns true if the requested field equals to the str (including NULL).
+ */
+bool dmi_match(enum dmi_field f, const char *str)
+{
+	const char *info = dmi_get_system_info(f);
+
+	if (info == NULL || str == NULL)
+		return info == str;
+
+	return !strcmp(info, str);
+}
+EXPORT_SYMBOL_GPL(dmi_match);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 2bfda178f27..34161907b2f 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -47,6 +47,7 @@ extern int dmi_name_in_vendors(const char *str);
 extern int dmi_name_in_serial(const char *str);
 extern int dmi_available;
 extern int dmi_walk(void (*decode)(const struct dmi_header *));
+extern bool dmi_match(enum dmi_field f, const char *str);
 
 #else
 
@@ -61,6 +62,8 @@ static inline int dmi_name_in_serial(const char *s) { return 0; }
 #define dmi_available 0
 static inline int dmi_walk(void (*decode)(const struct dmi_header *))
 	{ return -1; }
+static inline bool dmi_match(enum dmi_field f, const char *str)
+	{ return false; }
 
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 2a2ca6a96194c4744a2adeefbc09ce881f3c5abe Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:31 +0100
Subject: ide: replace the global ide_lock spinlock by per-hwgroup spinlocks
 (v2)

Now that (almost) all host drivers have been fixed not to abuse ide_lock
and core code usage of ide_lock has been sanitized we may safely replace
ide_lock by per-hwgroup locks.

This patch is partially based on earlier patch from Ravikiran G Thirumalai.

While at it:
- don't use deprecated HWIF() and HWGROUP() macros
- update locking documentation in ide.h

v2:
Add missing spin_lock_init(&hwgroup->lock).  (Noticed by Elias Oltmanns)

Cc: Vaibhav V. Nivargi <vaibhav.nivargi@gmail.com>
Cc: Alok N. Kataria <alokk@calsoftinc.com>
Cc: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Elias Oltmanns <eo@nebensachen.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io.c    | 33 ++++++++++++++++-----------------
 drivers/ide/ide-iops.c  | 48 +++++++++++++++++++++++-------------------------
 drivers/ide/ide-park.c  | 16 ++++++++--------
 drivers/ide/ide-probe.c | 26 +++++++++++++++-----------
 drivers/ide/ide.c       |  8 +++-----
 drivers/ide/umc8672.c   | 11 +++++++----
 drivers/scsi/ide-scsi.c | 32 +++++++++++++++++++++-----------
 include/linux/ide.h     | 12 +++++++-----
 8 files changed, 100 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 02059e96e6c..72d0d702d5d 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -907,7 +907,7 @@ repeat:
 
 /*
  * Issue a new request to a drive from hwgroup
- * Caller must have already done spin_lock_irqsave(&ide_lock, ..);
+ * Caller must have already done spin_lock_irqsave(&hwgroup->lock, ..);
  *
  * A hwgroup is a serialized group of IDE interfaces.  Usually there is
  * exactly one hwif (interface) per hwgroup, but buggy controllers (eg. CMD640)
@@ -919,7 +919,7 @@ repeat:
  * possibly along with many other devices.  This is especially common in
  * PCI-based systems with off-board IDE controller cards.
  *
- * The IDE driver uses the single global ide_lock spinlock to protect
+ * The IDE driver uses a per-hwgroup spinlock to protect
  * access to the request queues, and to protect the hwgroup->busy flag.
  *
  * The first thread into the driver for a particular hwgroup sets the
@@ -935,7 +935,7 @@ repeat:
  * will start the next request from the queue.  If no more work remains,
  * the driver will clear the hwgroup->busy flag and exit.
  *
- * The ide_lock (spinlock) is used to protect all access to the
+ * The per-hwgroup spinlock is used to protect all access to the
  * hwgroup->busy flag, but is otherwise not needed for most processing in
  * the driver.  This makes the driver much more friendlier to shared IRQs
  * than previous designs, while remaining 100% (?) SMP safe and capable.
@@ -948,7 +948,7 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq)
 	ide_startstop_t	startstop;
 	int             loops = 0;
 
-	/* caller must own ide_lock */
+	/* caller must own hwgroup->lock */
 	BUG_ON(!irqs_disabled());
 
 	while (!hwgroup->busy) {
@@ -1070,11 +1070,11 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq)
 		 */
 		if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq)
 			disable_irq_nosync(hwif->irq);
-		spin_unlock(&ide_lock);
+		spin_unlock(&hwgroup->lock);
 		local_irq_enable_in_hardirq();
 			/* allow other IRQs while we start this request */
 		startstop = start_request(drive, rq);
-		spin_lock_irq(&ide_lock);
+		spin_lock_irq(&hwgroup->lock);
 		if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq)
 			enable_irq(hwif->irq);
 		if (startstop == ide_stopped)
@@ -1172,7 +1172,7 @@ void ide_timer_expiry (unsigned long data)
 	unsigned long	flags;
 	unsigned long	wait = -1;
 
-	spin_lock_irqsave(&ide_lock, flags);
+	spin_lock_irqsave(&hwgroup->lock, flags);
 
 	if (((handler = hwgroup->handler) == NULL) ||
 	    (hwgroup->req_gen != hwgroup->req_gen_timer)) {
@@ -1205,7 +1205,7 @@ void ide_timer_expiry (unsigned long data)
 					hwgroup->timer.expires  = jiffies + wait;
 					hwgroup->req_gen_timer = hwgroup->req_gen;
 					add_timer(&hwgroup->timer);
-					spin_unlock_irqrestore(&ide_lock, flags);
+					spin_unlock_irqrestore(&hwgroup->lock, flags);
 					return;
 				}
 			}
@@ -1215,7 +1215,7 @@ void ide_timer_expiry (unsigned long data)
 			 * the handler() function, which means we need to
 			 * globally mask the specific IRQ:
 			 */
-			spin_unlock(&ide_lock);
+			spin_unlock(&hwgroup->lock);
 			hwif  = HWIF(drive);
 			/* disable_irq_nosync ?? */
 			disable_irq(hwif->irq);
@@ -1239,14 +1239,14 @@ void ide_timer_expiry (unsigned long data)
 						  hwif->tp_ops->read_status(hwif));
 			}
 			drive->service_time = jiffies - drive->service_start;
-			spin_lock_irq(&ide_lock);
+			spin_lock_irq(&hwgroup->lock);
 			enable_irq(hwif->irq);
 			if (startstop == ide_stopped)
 				hwgroup->busy = 0;
 		}
 	}
 	ide_do_request(hwgroup, IDE_NO_IRQ);
-	spin_unlock_irqrestore(&ide_lock, flags);
+	spin_unlock_irqrestore(&hwgroup->lock, flags);
 }
 
 /**
@@ -1339,14 +1339,13 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 {
 	unsigned long flags;
 	ide_hwgroup_t *hwgroup = (ide_hwgroup_t *)dev_id;
-	ide_hwif_t *hwif;
+	ide_hwif_t *hwif = hwgroup->hwif;
 	ide_drive_t *drive;
 	ide_handler_t *handler;
 	ide_startstop_t startstop;
 	irqreturn_t irq_ret = IRQ_NONE;
 
-	spin_lock_irqsave(&ide_lock, flags);
-	hwif = hwgroup->hwif;
+	spin_lock_irqsave(&hwgroup->lock, flags);
 
 	if (!ide_ack_intr(hwif))
 		goto out;
@@ -1416,7 +1415,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	hwgroup->handler = NULL;
 	hwgroup->req_gen++;
 	del_timer(&hwgroup->timer);
-	spin_unlock(&ide_lock);
+	spin_unlock(&hwgroup->lock);
 
 	if (hwif->port_ops && hwif->port_ops->clear_irq)
 		hwif->port_ops->clear_irq(drive);
@@ -1427,7 +1426,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	/* service this interrupt, may set handler for next interrupt */
 	startstop = handler(drive);
 
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	/*
 	 * Note that handler() may have set things up for another
 	 * interrupt to occur soon, but it cannot happen until
@@ -1448,7 +1447,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 out_handled:
 	irq_ret = IRQ_HANDLED;
 out:
-	spin_unlock_irqrestore(&ide_lock, flags);
+	spin_unlock_irqrestore(&hwgroup->lock, flags);
 	return irq_ret;
 }
 
diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
index c41c3b9b6f0..ad8bd653928 100644
--- a/drivers/ide/ide-iops.c
+++ b/drivers/ide/ide-iops.c
@@ -835,10 +835,12 @@ static void __ide_set_handler (ide_drive_t *drive, ide_handler_t *handler,
 void ide_set_handler (ide_drive_t *drive, ide_handler_t *handler,
 		      unsigned int timeout, ide_expiry_t *expiry)
 {
+	ide_hwgroup_t *hwgroup = drive->hwif->hwgroup;
 	unsigned long flags;
-	spin_lock_irqsave(&ide_lock, flags);
+
+	spin_lock_irqsave(&hwgroup->lock, flags);
 	__ide_set_handler(drive, handler, timeout, expiry);
-	spin_unlock_irqrestore(&ide_lock, flags);
+	spin_unlock_irqrestore(&hwgroup->lock, flags);
 }
 
 EXPORT_SYMBOL(ide_set_handler);
@@ -860,10 +862,11 @@ EXPORT_SYMBOL(ide_set_handler);
 void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler,
 			 unsigned timeout, ide_expiry_t *expiry)
 {
+	ide_hwif_t *hwif = drive->hwif;
+	ide_hwgroup_t *hwgroup = hwif->hwgroup;
 	unsigned long flags;
-	ide_hwif_t *hwif = HWIF(drive);
 
-	spin_lock_irqsave(&ide_lock, flags);
+	spin_lock_irqsave(&hwgroup->lock, flags);
 	__ide_set_handler(drive, handler, timeout, expiry);
 	hwif->tp_ops->exec_command(hwif, cmd);
 	/*
@@ -873,19 +876,20 @@ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler,
 	 * FIXME: we could skip this delay with care on non shared devices
 	 */
 	ndelay(400);
-	spin_unlock_irqrestore(&ide_lock, flags);
+	spin_unlock_irqrestore(&hwgroup->lock, flags);
 }
 EXPORT_SYMBOL(ide_execute_command);
 
 void ide_execute_pkt_cmd(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
+	ide_hwgroup_t *hwgroup = hwif->hwgroup;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ide_lock, flags);
+	spin_lock_irqsave(&hwgroup->lock, flags);
 	hwif->tp_ops->exec_command(hwif, ATA_CMD_PACKET);
 	ndelay(400);
-	spin_unlock_irqrestore(&ide_lock, flags);
+	spin_unlock_irqrestore(&hwgroup->lock, flags);
 }
 EXPORT_SYMBOL_GPL(ide_execute_pkt_cmd);
 
@@ -1076,22 +1080,16 @@ static void pre_reset(ide_drive_t *drive)
  */
 static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi)
 {
-	unsigned int unit;
-	unsigned long flags, timeout;
-	ide_hwif_t *hwif;
-	ide_hwgroup_t *hwgroup;
-	struct ide_io_ports *io_ports;
-	const struct ide_tp_ops *tp_ops;
+	ide_hwif_t *hwif = drive->hwif;
+	ide_hwgroup_t *hwgroup = hwif->hwgroup;
+	struct ide_io_ports *io_ports = &hwif->io_ports;
+	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	const struct ide_port_ops *port_ops;
+	unsigned long flags, timeout;
+	unsigned int unit;
 	DEFINE_WAIT(wait);
 
-	spin_lock_irqsave(&ide_lock, flags);
-	hwif = HWIF(drive);
-	hwgroup = HWGROUP(drive);
-
-	io_ports = &hwif->io_ports;
-
-	tp_ops = hwif->tp_ops;
+	spin_lock_irqsave(&hwgroup->lock, flags);
 
 	/* We must not reset with running handlers */
 	BUG_ON(hwgroup->handler != NULL);
@@ -1106,7 +1104,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi)
 		hwgroup->poll_timeout = jiffies + WAIT_WORSTCASE;
 		hwgroup->polling = 1;
 		__ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL);
-		spin_unlock_irqrestore(&ide_lock, flags);
+		spin_unlock_irqrestore(&hwgroup->lock, flags);
 		return ide_started;
 	}
 
@@ -1129,9 +1127,9 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi)
 		if (time_before_eq(timeout, now))
 			break;
 
-		spin_unlock_irqrestore(&ide_lock, flags);
+		spin_unlock_irqrestore(&hwgroup->lock, flags);
 		timeout = schedule_timeout_uninterruptible(timeout - now);
-		spin_lock_irqsave(&ide_lock, flags);
+		spin_lock_irqsave(&hwgroup->lock, flags);
 	} while (timeout);
 	finish_wait(&ide_park_wq, &wait);
 
@@ -1143,7 +1141,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi)
 		pre_reset(&hwif->drives[unit]);
 
 	if (io_ports->ctl_addr == 0) {
-		spin_unlock_irqrestore(&ide_lock, flags);
+		spin_unlock_irqrestore(&hwgroup->lock, flags);
 		ide_complete_drive_reset(drive, -ENXIO);
 		return ide_stopped;
 	}
@@ -1179,7 +1177,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi)
 	if (port_ops && port_ops->resetproc)
 		port_ops->resetproc(drive);
 
-	spin_unlock_irqrestore(&ide_lock, flags);
+	spin_unlock_irqrestore(&hwgroup->lock, flags);
 	return ide_started;
 }
 
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 03b00e57e93..63d01c55f86 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -7,17 +7,16 @@ DECLARE_WAIT_QUEUE_HEAD(ide_park_wq);
 
 static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 {
+	ide_hwgroup_t *hwgroup = drive->hwif->hwgroup;
 	struct request_queue *q = drive->queue;
 	struct request *rq;
 	int rc;
 
 	timeout += jiffies;
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	if (drive->dev_flags & IDE_DFLAG_PARKED) {
-		ide_hwgroup_t *hwgroup = drive->hwif->hwgroup;
-		int reset_timer;
+		int reset_timer = time_before(timeout, drive->sleep);
 
-		reset_timer = time_before(timeout, drive->sleep);
 		drive->sleep = timeout;
 		wake_up_all(&ide_park_wq);
 		if (reset_timer && hwgroup->sleeping &&
@@ -26,10 +25,10 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 			hwgroup->busy = 0;
 			blk_start_queueing(q);
 		}
-		spin_unlock_irq(&ide_lock);
+		spin_unlock_irq(&hwgroup->lock);
 		return;
 	}
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 
 	rq = blk_get_request(q, READ, __GFP_WAIT);
 	rq->cmd[0] = REQ_PARK_HEADS;
@@ -62,20 +61,21 @@ ssize_t ide_park_show(struct device *dev, struct device_attribute *attr,
 		      char *buf)
 {
 	ide_drive_t *drive = to_ide_device(dev);
+	ide_hwgroup_t *hwgroup = drive->hwif->hwgroup;
 	unsigned long now;
 	unsigned int msecs;
 
 	if (drive->dev_flags & IDE_DFLAG_NO_UNLOAD)
 		return -EOPNOTSUPP;
 
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	now = jiffies;
 	if (drive->dev_flags & IDE_DFLAG_PARKED &&
 	    time_after(drive->sleep, now))
 		msecs = jiffies_to_msecs(drive->sleep - now);
 	else
 		msecs = 0;
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 
 	return snprintf(buf, 20, "%u\n", msecs);
 }
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index c55bdbd2231..504bc9480e9 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -906,7 +906,8 @@ static int ide_init_queue(ide_drive_t *drive)
 	 *	do not.
 	 */
 
-	q = blk_init_queue_node(do_ide_request, &ide_lock, hwif_to_node(hwif));
+	q = blk_init_queue_node(do_ide_request, &hwif->hwgroup->lock,
+				hwif_to_node(hwif));
 	if (!q)
 		return 1;
 
@@ -947,7 +948,7 @@ static void ide_add_drive_to_hwgroup(ide_drive_t *drive)
 {
 	ide_hwgroup_t *hwgroup = drive->hwif->hwgroup;
 
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	if (!hwgroup->drive) {
 		/* first drive for hwgroup. */
 		drive->next = drive;
@@ -957,7 +958,7 @@ static void ide_add_drive_to_hwgroup(ide_drive_t *drive)
 		drive->next = hwgroup->drive->next;
 		hwgroup->drive->next = drive;
 	}
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 }
 
 /*
@@ -1002,7 +1003,7 @@ void ide_remove_port_from_hwgroup(ide_hwif_t *hwif)
 
 	ide_ports[hwif->index] = NULL;
 
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	/*
 	 * Remove us from the hwgroup, and free
 	 * the hwgroup if we were the only member
@@ -1030,7 +1031,7 @@ void ide_remove_port_from_hwgroup(ide_hwif_t *hwif)
 		}
 		BUG_ON(hwgroup->hwif == hwif);
 	}
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 }
 
 /*
@@ -1092,17 +1093,19 @@ static int init_irq (ide_hwif_t *hwif)
 		 * linked list, the first entry is the hwif that owns
 		 * hwgroup->handler - do not change that.
 		 */
-		spin_lock_irq(&ide_lock);
+		spin_lock_irq(&hwgroup->lock);
 		hwif->next = hwgroup->hwif->next;
 		hwgroup->hwif->next = hwif;
 		BUG_ON(hwif->next == hwif);
-		spin_unlock_irq(&ide_lock);
+		spin_unlock_irq(&hwgroup->lock);
 	} else {
 		hwgroup = kmalloc_node(sizeof(*hwgroup), GFP_KERNEL|__GFP_ZERO,
 				       hwif_to_node(hwif));
 		if (hwgroup == NULL)
 			goto out_up;
 
+		spin_lock_init(&hwgroup->lock);
+
 		hwif->hwgroup = hwgroup;
 		hwgroup->hwif = hwif->next = hwif;
 
@@ -1263,20 +1266,21 @@ static void ide_remove_drive_from_hwgroup(ide_drive_t *drive)
 static void drive_release_dev (struct device *dev)
 {
 	ide_drive_t *drive = container_of(dev, ide_drive_t, gendev);
+	ide_hwgroup_t *hwgroup = drive->hwif->hwgroup;
 
 	ide_proc_unregister_device(drive);
 
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	ide_remove_drive_from_hwgroup(drive);
 	kfree(drive->id);
 	drive->id = NULL;
 	drive->dev_flags &= ~IDE_DFLAG_PRESENT;
 	/* Messed up locking ... */
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 	blk_cleanup_queue(drive->queue);
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 	drive->queue = NULL;
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 
 	complete(&drive->gendev_rel_comp);
 }
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index bca6877ee6a..41d04205354 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -74,9 +74,6 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR,
 
 DEFINE_MUTEX(ide_cfg_mtx);
 
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock);
-EXPORT_SYMBOL(ide_lock);
-
 static void ide_port_init_devices_data(ide_hwif_t *);
 
 /*
@@ -333,6 +330,7 @@ static int set_pio_mode_abuse(ide_hwif_t *hwif, u8 req_pio)
 static int set_pio_mode(ide_drive_t *drive, int arg)
 {
 	ide_hwif_t *hwif = drive->hwif;
+	ide_hwgroup_t *hwgroup = hwif->hwgroup;
 	const struct ide_port_ops *port_ops = hwif->port_ops;
 
 	if (arg < 0 || arg > 255)
@@ -347,9 +345,9 @@ static int set_pio_mode(ide_drive_t *drive, int arg)
 			unsigned long flags;
 
 			/* take lock for IDE_DFLAG_[NO_]UNMASK/[NO_]IO_32BIT */
-			spin_lock_irqsave(&ide_lock, flags);
+			spin_lock_irqsave(&hwgroup->lock, flags);
 			port_ops->set_pio_mode(drive, arg);
-			spin_unlock_irqrestore(&ide_lock, flags);
+			spin_unlock_irqrestore(&hwgroup->lock, flags);
 		} else
 			port_ops->set_pio_mode(drive, arg);
 	} else {
diff --git a/drivers/ide/umc8672.c b/drivers/ide/umc8672.c
index 1da076e0c91..e29978cf619 100644
--- a/drivers/ide/umc8672.c
+++ b/drivers/ide/umc8672.c
@@ -107,18 +107,21 @@ static void umc_set_speeds(u8 speeds[])
 static void umc_set_pio_mode(ide_drive_t *drive, const u8 pio)
 {
 	ide_hwif_t *hwif = drive->hwif;
-	unsigned long flags;
+	ide_hwgroup_t *mate_hwgroup = hwif->mate ? hwif->mate->hwgroup : NULL;
+	unsigned long uninitialized_var(flags);
 
 	printk("%s: setting umc8672 to PIO mode%d (speed %d)\n",
 		drive->name, pio, pio_to_umc[pio]);
-	spin_lock_irqsave(&ide_lock, flags);
-	if (hwif->mate && hwif->mate->hwgroup->handler) {
+	if (mate_hwgroup)
+		spin_lock_irqsave(&mate_hwgroup->lock, flags);
+	if (mate_hwgroup && mate_hwgroup->handler) {
 		printk(KERN_ERR "umc8672: other interface is busy: exiting tune_umc()\n");
 	} else {
 		current_speeds[drive->name[2] - 'a'] = pio_to_umc[pio];
 		umc_set_speeds(current_speeds);
 	}
-	spin_unlock_irqrestore(&ide_lock, flags);
+	if (mate_hwgroup)
+		spin_unlock_irqrestore(&mate_hwgroup->lock, flags);
 }
 
 static const struct ide_port_ops umc8672_port_ops = {
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 2370fd82ebf..c24140aff8e 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -578,6 +578,8 @@ static int idescsi_eh_abort (struct scsi_cmnd *cmd)
 {
 	idescsi_scsi_t *scsi  = scsihost_to_idescsi(cmd->device->host);
 	ide_drive_t    *drive = scsi->drive;
+	ide_hwif_t     *hwif;
+	ide_hwgroup_t  *hwgroup;
 	int		busy;
 	int             ret   = FAILED;
 
@@ -594,13 +596,16 @@ static int idescsi_eh_abort (struct scsi_cmnd *cmd)
 		goto no_drive;
 	}
 
-	/* First give it some more time, how much is "right" is hard to say :-( */
+	hwif = drive->hwif;
+	hwgroup = hwif->hwgroup;
 
-	busy = ide_wait_not_busy(HWIF(drive), 100);	/* FIXME - uses mdelay which causes latency? */
+	/* First give it some more time, how much is "right" is hard to say :-(
+	   FIXME - uses mdelay which causes latency? */
+	busy = ide_wait_not_busy(hwif, 100);
 	if (test_bit(IDESCSI_LOG_CMD, &scsi->log))
 		printk (KERN_WARNING "ide-scsi: drive did%s become ready\n", busy?" not":"");
 
-	spin_lock_irq(&ide_lock);
+	spin_lock_irq(&hwgroup->lock);
 
 	/* If there is no pc running we're done (our interrupt took care of it) */
 	pc = drive->pc;
@@ -629,7 +634,7 @@ static int idescsi_eh_abort (struct scsi_cmnd *cmd)
 	}
 
 ide_unlock:
-	spin_unlock_irq(&ide_lock);
+	spin_unlock_irq(&hwgroup->lock);
 no_drive:
 	if (test_bit(IDESCSI_LOG_CMD, &scsi->log))
 		printk (KERN_WARNING "ide-scsi: abort returns %s\n", ret == SUCCESS?"success":"failed");
@@ -642,6 +647,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 	struct request *req;
 	idescsi_scsi_t *scsi  = scsihost_to_idescsi(cmd->device->host);
 	ide_drive_t    *drive = scsi->drive;
+	ide_hwgroup_t  *hwgroup;
 	int             ready = 0;
 	int             ret   = SUCCESS;
 
@@ -658,14 +664,18 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 		return FAILED;
 	}
 
+	hwgroup = drive->hwif->hwgroup;
+
 	spin_lock_irq(cmd->device->host->host_lock);
-	spin_lock(&ide_lock);
+	spin_lock(&hwgroup->lock);
 
 	pc = drive->pc;
+	if (pc)
+		req = pc->rq;
 
-	if (pc == NULL || (req = pc->rq) != HWGROUP(drive)->rq || !HWGROUP(drive)->handler) {
+	if (pc == NULL || req != hwgroup->rq || hwgroup->handler == NULL) {
 		printk (KERN_WARNING "ide-scsi: No active request in idescsi_eh_reset\n");
-		spin_unlock(&ide_lock);
+		spin_unlock(&hwgroup->lock);
 		spin_unlock_irq(cmd->device->host->host_lock);
 		return FAILED;
 	}
@@ -685,10 +695,10 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
 			BUG();
 	}
 
-	HWGROUP(drive)->rq = NULL;
-	HWGROUP(drive)->handler = NULL;
-	HWGROUP(drive)->busy = 1;		/* will set this to zero when ide reset finished */
-	spin_unlock(&ide_lock);
+	hwgroup->rq = NULL;
+	hwgroup->handler = NULL;
+	hwgroup->busy = 1; /* will set this to zero when ide reset finished */
+	spin_unlock(&hwgroup->lock);
 
 	ide_do_reset(drive);
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 010fb26a157..c871d325ced 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -909,6 +909,8 @@ typedef struct hwgroup_s {
 
 	int req_gen;
 	int req_gen_timer;
+
+	spinlock_t lock;
 } ide_hwgroup_t;
 
 typedef struct ide_driver_s ide_driver_t;
@@ -1610,13 +1612,13 @@ extern struct mutex ide_cfg_mtx;
 /*
  * Structure locking:
  *
- * ide_cfg_mtx and ide_lock together protect changes to
- * ide_hwif_t->{next,hwgroup}
+ * ide_cfg_mtx and hwgroup->lock together protect changes to
+ * ide_hwif_t->next
  * ide_drive_t->next
  *
- * ide_hwgroup_t->busy: ide_lock
- * ide_hwgroup_t->hwif: ide_lock
- * ide_hwif_t->mate: constant, no locking
+ * ide_hwgroup_t->busy: hwgroup->lock
+ * ide_hwgroup_t->hwif: hwgroup->lock
+ * ide_hwif_t->{hwgroup,mate}: constant, no locking
  * ide_drive_t->hwif: constant, no locking
  */
 
-- 
cgit v1.2.3-70-g09d2


From 27c01c2db05c3cf8824975e50403cd4fd9356dca Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:32 +0100
Subject: ide-cd: remove obsolete seek optimization

It doesn't make much sense nowadays and is problematic on some drives.

Cc: Borislav Petkov <petkovbb@googlemail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-cd.c | 89 ++++------------------------------------------------
 drivers/ide/ide-cd.h |  2 --
 include/linux/ide.h  |  4 ---
 3 files changed, 6 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 692fd4570df..ea35e94d098 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -774,52 +774,6 @@ static ide_startstop_t cdrom_start_rw_cont(ide_drive_t *drive)
 	return cdrom_transfer_packet_command(drive, rq, cdrom_newpc_intr);
 }
 
-#define IDECD_SEEK_THRESHOLD	(1000)			/* 1000 blocks */
-#define IDECD_SEEK_TIMER	(5 * WAIT_MIN_SLEEP)	/* 100 ms */
-#define IDECD_SEEK_TIMEOUT	(2 * WAIT_CMD)		/* 20 sec */
-
-static ide_startstop_t cdrom_seek_intr(ide_drive_t *drive)
-{
-	struct cdrom_info *info = drive->driver_data;
-	int stat;
-	static int retry = 10;
-
-	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
-
-	if (cdrom_decode_status(drive, 0, &stat))
-		return ide_stopped;
-
-	drive->atapi_flags |= IDE_AFLAG_SEEKING;
-
-	if (retry && time_after(jiffies, info->start_seek + IDECD_SEEK_TIMER)) {
-		if (--retry == 0)
-			drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
-	}
-	return ide_stopped;
-}
-
-static void ide_cd_prepare_seek_request(ide_drive_t *drive, struct request *rq)
-{
-	sector_t frame = rq->sector;
-
-	ide_debug_log(IDE_DBG_FUNC, "Call %s\n", __func__);
-
-	sector_div(frame, queue_hardsect_size(drive->queue) >> SECTOR_BITS);
-
-	memset(rq->cmd, 0, BLK_MAX_CDB);
-	rq->cmd[0] = GPCMD_SEEK;
-	put_unaligned(cpu_to_be32(frame), (unsigned int *) &rq->cmd[2]);
-
-	rq->timeout = ATAPI_WAIT_PC;
-}
-
-static ide_startstop_t cdrom_start_seek_continuation(ide_drive_t *drive)
-{
-	struct request *rq = drive->hwif->hwgroup->rq;
-
-	return cdrom_transfer_packet_command(drive, rq, &cdrom_seek_intr);
-}
-
 /*
  * Fix up a possibly partially-processed request so that we can start it over
  * entirely, or even put it back on the request queue.
@@ -1260,7 +1214,6 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 					sector_t block)
 {
-	struct cdrom_info *info = drive->driver_data;
 	ide_handler_t *fn;
 	int xferlen;
 
@@ -1270,44 +1223,14 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		      (unsigned long long)block);
 
 	if (blk_fs_request(rq)) {
-		if (drive->atapi_flags & IDE_AFLAG_SEEKING) {
-			ide_hwif_t *hwif = drive->hwif;
-			unsigned long elapsed = jiffies - info->start_seek;
-			int stat = hwif->tp_ops->read_status(hwif);
-
-			if ((stat & ATA_DSC) != ATA_DSC) {
-				if (elapsed < IDECD_SEEK_TIMEOUT) {
-					ide_stall_queue(drive,
-							IDECD_SEEK_TIMER);
-					return ide_stopped;
-				}
-				printk(KERN_ERR PFX "%s: DSC timeout\n",
-						drive->name);
-			}
-			drive->atapi_flags &= ~IDE_AFLAG_SEEKING;
-		}
-		if (rq_data_dir(rq) == READ &&
-		    IDE_LARGE_SEEK(info->last_block, block,
-			    IDECD_SEEK_THRESHOLD) &&
-		    (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP)) {
-			xferlen = 0;
-			fn = cdrom_start_seek_continuation;
-
-			drive->dma = 0;
-			info->start_seek = jiffies;
-
-			ide_cd_prepare_seek_request(drive, rq);
-		} else {
-			xferlen = 32768;
-			fn = cdrom_start_rw_cont;
+		xferlen = 32768;
+		fn = cdrom_start_rw_cont;
 
-			if (cdrom_start_rw(drive, rq) == ide_stopped)
-				return ide_stopped;
+		if (cdrom_start_rw(drive, rq) == ide_stopped)
+			return ide_stopped;
 
-			if (ide_cd_prepare_rw_request(drive, rq) == ide_stopped)
-				return ide_stopped;
-		}
-		info->last_block = block;
+		if (ide_cd_prepare_rw_request(drive, rq) == ide_stopped)
+			return ide_stopped;
 	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 		   rq->cmd_type == REQ_TYPE_ATA_PC) {
 		xferlen = rq->data_len;
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 5882b9a9ea8..d5ce3362dbd 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -88,8 +88,6 @@ struct cdrom_info {
 	struct request_sense sense_data;
 
 	struct request request_sense_request;
-	unsigned long last_block;
-	unsigned long start_seek;
 
 	u8 max_speed;		/* Max speed of the drive. */
 	u8 current_speed;	/* Current speed of the drive. */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c871d325ced..0b8af0535d8 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -122,8 +122,6 @@ struct ide_io_ports {
 #define MAX_DRIVES	2	/* per interface; 2 assumed by lots of code */
 #define SECTOR_SIZE	512
 
-#define IDE_LARGE_SEEK(b1,b2,t)	(((b1) > (b2) + (t)) || ((b2) > (b1) + (t)))
-
 /*
  * Timeouts for various operations:
  */
@@ -496,8 +494,6 @@ enum {
 	 * when more than one interrupt is needed.
 	 */
 	IDE_AFLAG_LIMIT_NFRAMES		= (1 << 7),
-	/* Seeking in progress. */
-	IDE_AFLAG_SEEKING		= (1 << 8),
 	/* Saved TOC information is current. */
 	IDE_AFLAG_TOC_VALID		= (1 << 9),
 	/* We think that the drive door is locked. */
-- 
cgit v1.2.3-70-g09d2


From 6b5cde3629701258004b94cde75dd1089b556b02 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:32 +0100
Subject: cmd64x: set IDE_HFLAG_SERIALIZE explictly for CMD646

* Set IDE_HFLAG_SERIALIZE explictly for CMD646.

* Remove no longer needed ide_cmd646 chipset type (which has
  a nice side-effect of fixing handling of unexpected IRQs).

Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/cmd64x.c    | 4 ++--
 drivers/ide/ide-probe.c | 2 +-
 drivers/ide/ide-proc.c  | 1 -
 include/linux/ide.h     | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
index 935385c77e0..3623bf013bc 100644
--- a/drivers/ide/cmd64x.c
+++ b/drivers/ide/cmd64x.c
@@ -424,10 +424,10 @@ static const struct ide_port_info cmd64x_chipsets[] __devinitdata = {
 		.name		= DRV_NAME,
 		.init_chipset	= init_chipset_cmd64x,
 		.enablebits	= {{0x51,0x04,0x04}, {0x51,0x08,0x08}},
-		.chipset	= ide_cmd646,
 		.port_ops	= &cmd64x_port_ops,
 		.dma_ops	= &cmd648_dma_ops,
-		.host_flags	= IDE_HFLAG_ABUSE_PREFETCH,
+		.host_flags	= IDE_HFLAG_SERIALIZE |
+				  IDE_HFLAG_ABUSE_PREFETCH,
 		.pio_mask	= ATA_PIO5,
 		.mwdma_mask	= ATA_MWDMA2,
 		.udma_mask	= ATA_UDMA2,
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index b0f9ededfad..a0eb72e2a02 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1125,7 +1125,7 @@ static int init_irq (ide_hwif_t *hwif)
 		sa = IRQF_SHARED;
 #endif /* __mc68000__ */
 
-		if (hwif->chipset == ide_pci || hwif->chipset == ide_cmd646)
+		if (hwif->chipset == ide_pci)
 			sa = IRQF_SHARED;
 
 		if (io_ports->ctl_addr)
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index dd899e1f384..e41669eec2c 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -48,7 +48,6 @@ static int proc_ide_read_imodel
 	case ide_ht6560b:	name = "ht6560b";	break;
 	case ide_rz1000:	name = "rz1000";	break;
 	case ide_trm290:	name = "trm290";	break;
-	case ide_cmd646:	name = "cmd646";	break;
 	case ide_cy82c693:	name = "cy82c693";	break;
 	case ide_4drives:	name = "4drives";	break;
 	case ide_pmac:		name = "mac-io";	break;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 0b8af0535d8..150e42311ee 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -171,7 +171,7 @@ enum {		ide_unknown,	ide_generic,	ide_pci,
 		ide_cmd640,	ide_dtc2278,	ide_ali14xx,
 		ide_qd65xx,	ide_umc8672,	ide_ht6560b,
 		ide_rz1000,	ide_trm290,
-		ide_cmd646,	ide_cy82c693,	ide_4drives,
+		ide_cy82c693,	ide_4drives,
 		ide_pmac,	ide_acorn,
 		ide_au1xxx,	ide_palm3710
 };
-- 
cgit v1.2.3-70-g09d2


From f58c1ab8deebc2360cef998f169a6727c288210f Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:33 +0100
Subject: ide: always set nIEN on idle devices

* Set nIEN for previous port/device in ide_do_request()
  also if port uses a non-shared IRQ.

* Remove no longer needed ide_hwif_t.sharing_irq.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io.c    | 2 +-
 drivers/ide/ide-probe.c | 4 +---
 include/linux/ide.h     | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ca51460af75..f776bd47501 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -1003,7 +1003,7 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq)
 		}
 	again:
 		hwif = HWIF(drive);
-		if (hwgroup->hwif->sharing_irq && hwif != hwgroup->hwif) {
+		if (hwif != hwgroup->hwif) {
 			/*
 			 * set nIEN for previous hwif, drives in the
 			 * quirk_list may not like intr setups/cleanups
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index a0eb72e2a02..81f61e8ea97 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1060,7 +1060,6 @@ static int init_irq (ide_hwif_t *hwif)
 
 		if (h && h->hwgroup) {  /* scan only initialized ports */
 			if (hwif->irq == h->irq) {
-				hwif->sharing_irq = h->sharing_irq = 1;
 				if (hwif->chipset != ide_pci ||
 				    h->chipset != ide_pci) {
 					save_match(hwif, h, &match);
@@ -1152,8 +1151,7 @@ static int init_irq (ide_hwif_t *hwif)
 		io_ports->data_addr, hwif->irq);
 #endif /* __mc68000__ */
 	if (match)
-		printk(KERN_CONT " (%sed with %s)",
-			hwif->sharing_irq ? "shar" : "serializ", match->name);
+		printk(KERN_CONT " (serialized with %s)", match->name);
 	printk(KERN_CONT "\n");
 
 	mutex_unlock(&ide_cfg_mtx);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 150e42311ee..1d28006aec6 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -842,7 +842,6 @@ typedef struct hwif_s {
 
 	unsigned	present    : 1;	/* this interface exists */
 	unsigned	serialized : 1;	/* serialized all channel operation */
-	unsigned	sharing_irq: 1;	/* 1 = sharing irq with another hwif */
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
 
 	struct device		gendev;
-- 
cgit v1.2.3-70-g09d2


From 7f1ac8c4b9dadc55ec656b368f5f470f2cbe3083 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:33 +0100
Subject: rz1000: apply chipset quirks early (v2)

* Use pci_name(dev) instead of hwif->name in init_hwif_rz1000().

* init_hwif_rz1000() -> rz1000_init_chipset().  Update rz1000_init_one()
  to use rz1000_init_chipset() and add now required rz1000_remove().

* Remove superfluous ide_rz1000 chipset type.

v2:
* unsigned int rz1000_init_chipset() -> int rz1000_disable_readahead()
  per Sergei's suggestion.

Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-proc.c |  1 -
 drivers/ide/rz1000.c   | 36 +++++++++++++++++++++++++-----------
 include/linux/ide.h    |  3 +--
 3 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index e41669eec2c..c2e6b8927bd 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -46,7 +46,6 @@ static int proc_ide_read_imodel
 	case ide_qd65xx:	name = "qd65xx";	break;
 	case ide_umc8672:	name = "umc8672";	break;
 	case ide_ht6560b:	name = "ht6560b";	break;
-	case ide_rz1000:	name = "rz1000";	break;
 	case ide_trm290:	name = "trm290";	break;
 	case ide_cy82c693:	name = "cy82c693";	break;
 	case ide_4drives:	name = "4drives";	break;
diff --git a/drivers/ide/rz1000.c b/drivers/ide/rz1000.c
index 7daf0135cba..a6414a884eb 100644
--- a/drivers/ide/rz1000.c
+++ b/drivers/ide/rz1000.c
@@ -22,34 +22,48 @@
 
 #define DRV_NAME "rz1000"
 
-static void __devinit init_hwif_rz1000 (ide_hwif_t *hwif)
+static int __devinit rz1000_disable_readahead(struct pci_dev *dev)
 {
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
 	u16 reg;
 
 	if (!pci_read_config_word (dev, 0x40, &reg) &&
 	    !pci_write_config_word(dev, 0x40, reg & 0xdfff)) {
 		printk(KERN_INFO "%s: disabled chipset read-ahead "
-			"(buggy RZ1000/RZ1001)\n", hwif->name);
+			"(buggy RZ1000/RZ1001)\n", pci_name(dev));
+		return 0;
 	} else {
-		if (hwif->mate)
-			hwif->mate->serialized = hwif->serialized = 1;
-		hwif->host_flags |= IDE_HFLAG_NO_UNMASK_IRQS;
 		printk(KERN_INFO "%s: serialized, disabled unmasking "
-			"(buggy RZ1000/RZ1001)\n", hwif->name);
+			"(buggy RZ1000/RZ1001)\n", pci_name(dev));
+		return 1;
 	}
 }
 
 static const struct ide_port_info rz1000_chipset __devinitdata = {
 	.name		= DRV_NAME,
-	.init_hwif	= init_hwif_rz1000,
-	.chipset	= ide_rz1000,
 	.host_flags	= IDE_HFLAG_NO_DMA,
 };
 
 static int __devinit rz1000_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
-	return ide_pci_init_one(dev, &rz1000_chipset, NULL);
+	struct ide_port_info d = rz1000_chipset;
+	int rc;
+
+	rc = pci_enable_device(dev);
+	if (rc)
+		return rc;
+
+	if (rz1000_disable_readahead(dev)) {
+		d.host_flags |= IDE_HFLAG_SERIALIZE;
+		d.host_flags |= IDE_HFLAG_NO_UNMASK_IRQS;
+	}
+
+	return ide_pci_init_one(dev, &d, NULL);
+}
+
+static void rz1000_remove(struct pci_dev *dev)
+{
+	ide_pci_remove(dev);
+	pci_disable_device(dev);
 }
 
 static const struct pci_device_id rz1000_pci_tbl[] = {
@@ -63,7 +77,7 @@ static struct pci_driver rz1000_pci_driver = {
 	.name		= "RZ1000_IDE",
 	.id_table	= rz1000_pci_tbl,
 	.probe		= rz1000_init_one,
-	.remove		= ide_pci_remove,
+	.remove		= rz1000_remove,
 };
 
 static int __init rz1000_ide_init(void)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1d28006aec6..fc1a966c7b7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -170,8 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *);
 enum {		ide_unknown,	ide_generic,	ide_pci,
 		ide_cmd640,	ide_dtc2278,	ide_ali14xx,
 		ide_qd65xx,	ide_umc8672,	ide_ht6560b,
-		ide_rz1000,	ide_trm290,
-		ide_cy82c693,	ide_4drives,
+		ide_trm290,	ide_cy82c693,	ide_4drives,
 		ide_pmac,	ide_acorn,
 		ide_au1xxx,	ide_palm3710
 };
-- 
cgit v1.2.3-70-g09d2


From 6b4924962c49655494d2c8e9d3faab0e349a3062 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:34 +0100
Subject: ide: add ->max_sectors field to struct ide_port_info

* Add ->max_sectors field to struct ide_port_info to allow host drivers
  to specify value used for hwif->rqsize (if smaller than the default).

* Convert pdc202xx_old to use ->max_sectors and remove no longer needed
  IDE_HFLAG_RQSIZE_256 flag.

There should be no functional changes caused by this patch.

Acked-by: Sergei Shtyltov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-probe.c    | 4 ++--
 drivers/ide/pdc202xx_old.c | 9 +++++----
 include/linux/ide.h        | 5 +++--
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 81f61e8ea97..f76c22c4508 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1442,8 +1442,8 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port,
 			hwif->mate->serialized = hwif->serialized = 1;
 	}
 
-	if (d->host_flags & IDE_HFLAG_RQSIZE_256)
-		hwif->rqsize = 256;
+	if (d->max_sectors)
+		hwif->rqsize = d->max_sectors;
 
 	/* call chipset specific routine for each enabled port */
 	if (d->init_hwif)
diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index 799557c25ee..624e62e5cc9 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -350,16 +350,17 @@ static const struct ide_dma_ops pdc2026x_dma_ops = {
 	.dma_timeout		= pdc202xx_dma_timeout,
 };
 
-#define DECLARE_PDC2026X_DEV(udma, extra_flags) \
+#define DECLARE_PDC2026X_DEV(udma, sectors) \
 	{ \
 		.name		= DRV_NAME, \
 		.init_chipset	= init_chipset_pdc202xx, \
 		.port_ops	= &pdc2026x_port_ops, \
 		.dma_ops	= &pdc2026x_dma_ops, \
-		.host_flags	= IDE_HFLAGS_PDC202XX | extra_flags, \
+		.host_flags	= IDE_HFLAGS_PDC202XX, \
 		.pio_mask	= ATA_PIO4, \
 		.mwdma_mask	= ATA_MWDMA2, \
 		.udma_mask	= udma, \
+		.max_sectors	= sectors, \
 	}
 
 static const struct ide_port_info pdc202xx_chipsets[] __devinitdata = {
@@ -376,8 +377,8 @@ static const struct ide_port_info pdc202xx_chipsets[] __devinitdata = {
 
 	/* 1: PDC2026{2,3} */
 	DECLARE_PDC2026X_DEV(ATA_UDMA4, 0),
-	/* 2: PDC2026{5,7} */
-	DECLARE_PDC2026X_DEV(ATA_UDMA5, IDE_HFLAG_RQSIZE_256),
+	/* 2: PDC2026{5,7}: UDMA5, limit LBA48 requests to 256 sectors */
+	DECLARE_PDC2026X_DEV(ATA_UDMA5, 256),
 };
 
 /**
diff --git a/include/linux/ide.h b/include/linux/ide.h
index fc1a966c7b7..2574dda4a3e 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1372,8 +1372,6 @@ enum {
 	IDE_HFLAG_LEGACY_IRQS		= (1 << 21),
 	/* force use of legacy IRQs */
 	IDE_HFLAG_FORCE_LEGACY_IRQS	= (1 << 22),
-	/* limit LBA48 requests to 256 sectors */
-	IDE_HFLAG_RQSIZE_256		= (1 << 23),
 	/* use 32-bit I/O ops */
 	IDE_HFLAG_IO_32BIT		= (1 << 24),
 	/* unmask IRQs */
@@ -1411,6 +1409,9 @@ struct ide_port_info {
 
 	ide_pci_enablebit_t	enablebits[2];
 	hwif_chipset_t		chipset;
+
+	u16			max_sectors;	/* if < than the default one */
+
 	u32			host_flags;
 	u8			pio_mask;
 	u8			swdma_mask;
-- 
cgit v1.2.3-70-g09d2


From 1f66019bdf902cb59adf959e462bcd3f4c01f683 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:34 +0100
Subject: trm290: add IDE_HFLAG_TRM290 host flag

* Add IDE_HFLAG_TRM290 host flag and use it in ide_build_dmatable().

* Remove no longer needed ide_trm290 chipset type.

Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-dma-sff.c | 2 +-
 drivers/ide/ide-proc.c    | 1 -
 drivers/ide/trm290.c      | 4 ++--
 include/linux/ide.h       | 4 +++-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-dma-sff.c b/drivers/ide/ide-dma-sff.c
index cac431f0df1..1f2a5f56f81 100644
--- a/drivers/ide/ide-dma-sff.c
+++ b/drivers/ide/ide-dma-sff.c
@@ -98,10 +98,10 @@ int ide_build_dmatable(ide_drive_t *drive, struct request *rq)
 {
 	ide_hwif_t *hwif = drive->hwif;
 	__le32 *table = (__le32 *)hwif->dmatable_cpu;
-	unsigned int is_trm290	= (hwif->chipset == ide_trm290) ? 1 : 0;
 	unsigned int count = 0;
 	int i;
 	struct scatterlist *sg;
+	u8 is_trm290 = !!(hwif->host_flags & IDE_HFLAG_TRM290);
 
 	hwif->sg_nents = ide_build_sglist(drive, rq);
 	if (hwif->sg_nents == 0)
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index c2e6b8927bd..066d2317545 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -46,7 +46,6 @@ static int proc_ide_read_imodel
 	case ide_qd65xx:	name = "qd65xx";	break;
 	case ide_umc8672:	name = "umc8672";	break;
 	case ide_ht6560b:	name = "ht6560b";	break;
-	case ide_trm290:	name = "trm290";	break;
 	case ide_cy82c693:	name = "cy82c693";	break;
 	case ide_4drives:	name = "4drives";	break;
 	case ide_pmac:		name = "mac-io";	break;
diff --git a/drivers/ide/trm290.c b/drivers/ide/trm290.c
index 75ea6152656..2a5ea90cf8b 100644
--- a/drivers/ide/trm290.c
+++ b/drivers/ide/trm290.c
@@ -328,10 +328,10 @@ static struct ide_dma_ops trm290_dma_ops = {
 static const struct ide_port_info trm290_chipset __devinitdata = {
 	.name		= DRV_NAME,
 	.init_hwif	= init_hwif_trm290,
-	.chipset	= ide_trm290,
 	.port_ops	= &trm290_port_ops,
 	.dma_ops	= &trm290_dma_ops,
-	.host_flags	= IDE_HFLAG_NO_ATAPI_DMA |
+	.host_flags	= IDE_HFLAG_TRM290 |
+			  IDE_HFLAG_NO_ATAPI_DMA |
 #if 0 /* play it safe for now */
 			  IDE_HFLAG_TRUST_BIOS_FOR_DMA |
 #endif
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 2574dda4a3e..f62d35a5fb7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -170,7 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *);
 enum {		ide_unknown,	ide_generic,	ide_pci,
 		ide_cmd640,	ide_dtc2278,	ide_ali14xx,
 		ide_qd65xx,	ide_umc8672,	ide_ht6560b,
-		ide_trm290,	ide_cy82c693,	ide_4drives,
+		ide_cy82c693,	ide_4drives,
 		ide_pmac,	ide_acorn,
 		ide_au1xxx,	ide_palm3710
 };
@@ -1372,6 +1372,8 @@ enum {
 	IDE_HFLAG_LEGACY_IRQS		= (1 << 21),
 	/* force use of legacy IRQs */
 	IDE_HFLAG_FORCE_LEGACY_IRQS	= (1 << 22),
+	/* host is TRM290 */
+	IDE_HFLAG_TRM290		= (1 << 23),
 	/* use 32-bit I/O ops */
 	IDE_HFLAG_IO_32BIT		= (1 << 24),
 	/* unmask IRQs */
-- 
cgit v1.2.3-70-g09d2


From b7876a6fb6e9bf6cbcf7b40cf034aa4138d7978f Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:34 +0100
Subject: cy82c693: remove superfluous ide_cy82c693 chipset type

Since CY82C693 doesn't require serialization we may as well
use the default ide_pci chipset type.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/cy82c693.c | 1 -
 drivers/ide/ide-proc.c | 1 -
 include/linux/ide.h    | 3 +--
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/cy82c693.c b/drivers/ide/cy82c693.c
index 5297f07d293..d37baf8ecc5 100644
--- a/drivers/ide/cy82c693.c
+++ b/drivers/ide/cy82c693.c
@@ -292,7 +292,6 @@ static const struct ide_port_info cy82c693_chipset __devinitdata = {
 	.name		= DRV_NAME,
 	.init_iops	= init_iops_cy82c693,
 	.port_ops	= &cy82c693_port_ops,
-	.chipset	= ide_cy82c693,
 	.host_flags	= IDE_HFLAG_SINGLE,
 	.pio_mask	= ATA_PIO4,
 	.swdma_mask	= ATA_SWDMA2,
diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c
index 066d2317545..a14e2938e4f 100644
--- a/drivers/ide/ide-proc.c
+++ b/drivers/ide/ide-proc.c
@@ -46,7 +46,6 @@ static int proc_ide_read_imodel
 	case ide_qd65xx:	name = "qd65xx";	break;
 	case ide_umc8672:	name = "umc8672";	break;
 	case ide_ht6560b:	name = "ht6560b";	break;
-	case ide_cy82c693:	name = "cy82c693";	break;
 	case ide_4drives:	name = "4drives";	break;
 	case ide_pmac:		name = "mac-io";	break;
 	case ide_au1xxx:	name = "au1xxx";	break;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index f62d35a5fb7..f89d6d69a38 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -170,8 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *);
 enum {		ide_unknown,	ide_generic,	ide_pci,
 		ide_cmd640,	ide_dtc2278,	ide_ali14xx,
 		ide_qd65xx,	ide_umc8672,	ide_ht6560b,
-		ide_cy82c693,	ide_4drives,
-		ide_pmac,	ide_acorn,
+		ide_4drives,	ide_pmac,	ide_acorn,
 		ide_au1xxx,	ide_palm3710
 };
 
-- 
cgit v1.2.3-70-g09d2


From 702c026be87ef8374ae58122969a4b0b081ce6f2 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:36 +0100
Subject: ide: rework handling of serialized ports (v2)

* hpt366: set IDE_HFLAG_SERIALIZE in ->host_flags if needed
  in init_hwif_hpt366().  Remove HPT_SERIALIZE_IO while at it.

* Set IDE_HFLAG_SERIALIZE in ->host_flags if needed in
  ide_init_port().

* Convert init_irq() to use IDE_HFLAG_SERIALIZE together with
  hwif->host to find out ports which need to be serialized.

* Remove no longer needed save_match() and ide_hwif_t.serialized.

v2:
* Set host's ->host_flags field instead of port's copy.

This patch should fix the incorrect grouping of port(s) from
host(s) that need serialization with port(s) that happen to use
the same IRQ(s) but are from the host(s) that don't need it.

Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/hpt366.c    |  8 +-------
 drivers/ide/ide-probe.c | 50 +++++--------------------------------------------
 include/linux/ide.h     |  1 -
 3 files changed, 6 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
index f5afd46ed51..b18e10d99d2 100644
--- a/drivers/ide/hpt366.c
+++ b/drivers/ide/hpt366.c
@@ -135,7 +135,6 @@
 /* various tuning parameters */
 #define HPT_RESET_STATE_ENGINE
 #undef	HPT_DELAY_INTERRUPT
-#define HPT_SERIALIZE_IO	0
 
 static const char *quirk_drives[] = {
 	"QUANTUM FIREBALLlct08 08",
@@ -1288,7 +1287,6 @@ static u8 hpt3xx_cable_detect(ide_hwif_t *hwif)
 static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
 {
 	struct hpt_info *info	= hpt3xx_get_info(hwif->dev);
-	int serialize		= HPT_SERIALIZE_IO;
 	u8  chip_type		= info->chip_type;
 
 	/* Cache the channel's MISC. control registers' offset */
@@ -1305,13 +1303,9 @@ static void __devinit init_hwif_hpt366(ide_hwif_t *hwif)
 		 * Clock is shared between the channels,
 		 * so we'll have to serialize them... :-(
 		 */
-		serialize = 1;
+		hwif->host->host_flags |= IDE_HFLAG_SERIALIZE;
 		hwif->rw_disk = &hpt3xxn_rw_disk;
 	}
-
-	/* Serialize access to this device if needed */
-	if (serialize && hwif->mate)
-		hwif->serialized = hwif->mate->serialized = 1;
 }
 
 static int __devinit init_dma_hpt366(ide_hwif_t *hwif,
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index f76c22c4508..c1cd1af2646 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -863,31 +863,6 @@ static void ide_port_tune_devices(ide_hwif_t *hwif)
 	}
 }
 
-/*
- * save_match() is used to simplify logic in init_irq() below.
- *
- * A loophole here is that we may not know about a particular
- * hwif's irq until after that hwif is actually probed/initialized..
- * This could be a problem for the case where an hwif is on a
- * dual interface that requires serialization (eg. cmd640) and another
- * hwif using one of the same irqs is initialized beforehand.
- *
- * This routine detects and reports such situations, but does not fix them.
- */
-static void save_match(ide_hwif_t *hwif, ide_hwif_t *new, ide_hwif_t **match)
-{
-	ide_hwif_t *m = *match;
-
-	if (m && m->hwgroup && m->hwgroup != new->hwgroup) {
-		if (!new->hwgroup)
-			return;
-		printk(KERN_WARNING "%s: potential IRQ problem with %s and %s\n",
-			hwif->name, new->name, m->name);
-	}
-	if (!m || m->irq != hwif->irq) /* don't undo a prior perfect match */
-		*match = new;
-}
-
 /*
  * init request queue
  */
@@ -1052,26 +1027,13 @@ static int init_irq (ide_hwif_t *hwif)
 	mutex_lock(&ide_cfg_mtx);
 	hwif->hwgroup = NULL;
 
-	/*
-	 * Group up with any other hwifs that share our irq(s).
-	 */
 	for (index = 0; index < MAX_HWIFS; index++) {
 		ide_hwif_t *h = ide_ports[index];
 
 		if (h && h->hwgroup) {  /* scan only initialized ports */
-			if (hwif->irq == h->irq) {
-				if (hwif->chipset != ide_pci ||
-				    h->chipset != ide_pci) {
-					save_match(hwif, h, &match);
-				}
-			}
-			if (hwif->serialized) {
-				if (hwif->mate && hwif->mate->irq == h->irq)
-					save_match(hwif, h, &match);
-			}
-			if (h->serialized) {
-				if (h->mate && hwif->irq == h->mate->irq)
-					save_match(hwif, h, &match);
+			if (hwif->host->host_flags & IDE_HFLAG_SERIALIZE) {
+				if (hwif->host == h->host)
+					match = h;
 			}
 		}
 	}
@@ -1437,10 +1399,8 @@ static void ide_init_port(ide_hwif_t *hwif, unsigned int port,
 	}
 
 	if ((d->host_flags & IDE_HFLAG_SERIALIZE) ||
-	    ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base)) {
-		if (hwif->mate)
-			hwif->mate->serialized = hwif->serialized = 1;
-	}
+	    ((d->host_flags & IDE_HFLAG_SERIALIZE_DMA) && hwif->dma_base))
+		hwif->host->host_flags |= IDE_HFLAG_SERIALIZE;
 
 	if (d->max_sectors)
 		hwif->rqsize = d->max_sectors;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index f89d6d69a38..9b89cab6d49 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -839,7 +839,6 @@ typedef struct hwif_s {
 	unsigned	extra_ports;	/* number of extra dma ports */
 
 	unsigned	present    : 1;	/* this interface exists */
-	unsigned	serialized : 1;	/* serialized all channel operation */
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
 
 	struct device		gendev;
-- 
cgit v1.2.3-70-g09d2


From e2984c628c924442132304ae662da433f41c05c9 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Mon, 29 Dec 2008 20:27:37 +0100
Subject: ide: move Power Management support to ide-pm.c

There should be no functional changes caused by this patch.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Makefile |   2 +-
 drivers/ide/ide-io.c | 159 ----------------------------------
 drivers/ide/ide-pm.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/ide/ide.c    |  74 ----------------
 include/linux/ide.h  |   8 ++
 5 files changed, 244 insertions(+), 234 deletions(-)
 create mode 100644 drivers/ide/ide-pm.c

(limited to 'include/linux')

diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index 7818d402b18..bdc7b81bc04 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -5,7 +5,7 @@
 EXTRA_CFLAGS				+= -Idrivers/ide
 
 ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \
-	      ide-taskfile.o ide-park.o ide-pio-blacklist.o
+	      ide-taskfile.o ide-pm.o ide-park.o ide-pio-blacklist.o
 
 # core IDE code
 ide-core-$(CONFIG_IDE_TIMINGS)		+= ide-timings.o
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 26d3f3cacc8..ecacc008fda 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -120,98 +120,6 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
 }
 EXPORT_SYMBOL(ide_end_request);
 
-static void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
-{
-	struct request_pm_state *pm = rq->data;
-
-#ifdef DEBUG_PM
-	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
-		drive->name, pm->pm_step);
-#endif
-	if (drive->media != ide_disk)
-		return;
-
-	switch (pm->pm_step) {
-	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
-		if (pm->pm_state == PM_EVENT_FREEZE)
-			pm->pm_step = IDE_PM_COMPLETED;
-		else
-			pm->pm_step = IDE_PM_STANDBY;
-		break;
-	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		pm->pm_step = IDE_PM_COMPLETED;
-		break;
-	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
-		pm->pm_step = IDE_PM_IDLE;
-		break;
-	case IDE_PM_IDLE:		/* Resume step 2 (idle)*/
-		pm->pm_step = IDE_PM_RESTORE_DMA;
-		break;
-	}
-}
-
-static ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
-{
-	struct request_pm_state *pm = rq->data;
-	ide_task_t *args = rq->special;
-
-	memset(args, 0, sizeof(*args));
-
-	switch (pm->pm_step) {
-	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
-		if (drive->media != ide_disk)
-			break;
-		/* Not supported? Switch to next step now. */
-		if (ata_id_flush_enabled(drive->id) == 0 ||
-		    (drive->dev_flags & IDE_DFLAG_WCACHE) == 0) {
-			ide_complete_power_step(drive, rq);
-			return ide_stopped;
-		}
-		if (ata_id_flush_ext_enabled(drive->id))
-			args->tf.command = ATA_CMD_FLUSH_EXT;
-		else
-			args->tf.command = ATA_CMD_FLUSH;
-		goto out_do_tf;
-	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		args->tf.command = ATA_CMD_STANDBYNOW1;
-		goto out_do_tf;
-	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
-		ide_set_max_pio(drive);
-		/*
-		 * skip IDE_PM_IDLE for ATAPI devices
-		 */
-		if (drive->media != ide_disk)
-			pm->pm_step = IDE_PM_RESTORE_DMA;
-		else
-			ide_complete_power_step(drive, rq);
-		return ide_stopped;
-	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
-		args->tf.command = ATA_CMD_IDLEIMMEDIATE;
-		goto out_do_tf;
-	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
-		/*
-		 * Right now, all we do is call ide_set_dma(drive),
-		 * we could be smarter and check for current xfer_speed
-		 * in struct drive etc...
-		 */
-		if (drive->hwif->dma_ops == NULL)
-			break;
-		/*
-		 * TODO: respect IDE_DFLAG_USING_DMA
-		 */
-		ide_set_dma(drive);
-		break;
-	}
-
-	pm->pm_step = IDE_PM_COMPLETED;
-	return ide_stopped;
-
-out_do_tf:
-	args->tf_flags	 = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
-	args->data_phase = TASKFILE_NO_DATA;
-	return do_rw_taskfile(drive, args);
-}
-
 /**
  *	ide_end_dequeued_request	-	complete an IDE I/O
  *	@drive: IDE device for the I/O
@@ -236,39 +144,6 @@ int ide_end_dequeued_request(ide_drive_t *drive, struct request *rq,
 }
 EXPORT_SYMBOL_GPL(ide_end_dequeued_request);
 
-
-/**
- *	ide_complete_pm_request - end the current Power Management request
- *	@drive: target drive
- *	@rq: request
- *
- *	This function cleans up the current PM request and stops the queue
- *	if necessary.
- */
-static void ide_complete_pm_request (ide_drive_t *drive, struct request *rq)
-{
-	struct request_queue *q = drive->queue;
-	unsigned long flags;
-
-#ifdef DEBUG_PM
-	printk("%s: completing PM request, %s\n", drive->name,
-	       blk_pm_suspend_request(rq) ? "suspend" : "resume");
-#endif
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (blk_pm_suspend_request(rq)) {
-		blk_stop_queue(q);
-	} else {
-		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
-		blk_start_queue(q);
-	}
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	drive->hwif->hwgroup->rq = NULL;
-
-	if (blk_end_request(rq, 0, 0))
-		BUG();
-}
-
 /**
  *	ide_end_drive_cmd	-	end an explicit drive command
  *	@drive: command 
@@ -697,40 +572,6 @@ static ide_startstop_t ide_special_rq(ide_drive_t *drive, struct request *rq)
 	}
 }
 
-static void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
-{
-	struct request_pm_state *pm = rq->data;
-
-	if (blk_pm_suspend_request(rq) &&
-	    pm->pm_step == IDE_PM_START_SUSPEND)
-		/* Mark drive blocked when starting the suspend sequence. */
-		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (blk_pm_resume_request(rq) &&
-		 pm->pm_step == IDE_PM_START_RESUME) {
-		/* 
-		 * The first thing we do on wakeup is to wait for BSY bit to
-		 * go away (with a looong timeout) as a drive on this hwif may
-		 * just be POSTing itself.
-		 * We do that before even selecting as the "other" device on
-		 * the bus may be broken enough to walk on our toes at this
-		 * point.
-		 */
-		ide_hwif_t *hwif = drive->hwif;
-		int rc;
-#ifdef DEBUG_PM
-		printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
-#endif
-		rc = ide_wait_not_busy(hwif, 35000);
-		if (rc)
-			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
-		SELECT_DRIVE(drive);
-		hwif->tp_ops->set_irq(hwif, 1);
-		rc = ide_wait_not_busy(hwif, 100000);
-		if (rc)
-			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
-	}
-}
-
 /**
  *	start_request	-	start of I/O and command issuing for IDE
  *
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
new file mode 100644
index 00000000000..8282c6086e6
--- /dev/null
+++ b/drivers/ide/ide-pm.c
@@ -0,0 +1,235 @@
+#include <linux/kernel.h>
+#include <linux/ide.h>
+#include <linux/hdreg.h>
+
+int generic_ide_suspend(struct device *dev, pm_message_t mesg)
+{
+	ide_drive_t *drive = dev->driver_data, *pair = ide_get_pair_dev(drive);
+	ide_hwif_t *hwif = HWIF(drive);
+	struct request *rq;
+	struct request_pm_state rqpm;
+	ide_task_t args;
+	int ret;
+
+	/* call ACPI _GTM only once */
+	if ((drive->dn & 1) == 0 || pair == NULL)
+		ide_acpi_get_timing(hwif);
+
+	memset(&rqpm, 0, sizeof(rqpm));
+	memset(&args, 0, sizeof(args));
+	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
+	rq->special = &args;
+	rq->data = &rqpm;
+	rqpm.pm_step = IDE_PM_START_SUSPEND;
+	if (mesg.event == PM_EVENT_PRETHAW)
+		mesg.event = PM_EVENT_FREEZE;
+	rqpm.pm_state = mesg.event;
+
+	ret = blk_execute_rq(drive->queue, NULL, rq, 0);
+	blk_put_request(rq);
+
+	/* call ACPI _PS3 only after both devices are suspended */
+	if (ret == 0 && ((drive->dn & 1) || pair == NULL))
+		ide_acpi_set_state(hwif, 0);
+
+	return ret;
+}
+
+int generic_ide_resume(struct device *dev)
+{
+	ide_drive_t *drive = dev->driver_data, *pair = ide_get_pair_dev(drive);
+	ide_hwif_t *hwif = HWIF(drive);
+	struct request *rq;
+	struct request_pm_state rqpm;
+	ide_task_t args;
+	int err;
+
+	/* call ACPI _PS0 / _STM only once */
+	if ((drive->dn & 1) == 0 || pair == NULL) {
+		ide_acpi_set_state(hwif, 1);
+		ide_acpi_push_timing(hwif);
+	}
+
+	ide_acpi_exec_tfs(drive);
+
+	memset(&rqpm, 0, sizeof(rqpm));
+	memset(&args, 0, sizeof(args));
+	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
+	rq->cmd_type = REQ_TYPE_PM_RESUME;
+	rq->cmd_flags |= REQ_PREEMPT;
+	rq->special = &args;
+	rq->data = &rqpm;
+	rqpm.pm_step = IDE_PM_START_RESUME;
+	rqpm.pm_state = PM_EVENT_ON;
+
+	err = blk_execute_rq(drive->queue, NULL, rq, 1);
+	blk_put_request(rq);
+
+	if (err == 0 && dev->driver) {
+		ide_driver_t *drv = to_ide_driver(dev->driver);
+
+		if (drv->resume)
+			drv->resume(drive);
+	}
+
+	return err;
+}
+
+void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
+{
+	struct request_pm_state *pm = rq->data;
+
+#ifdef DEBUG_PM
+	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
+		drive->name, pm->pm_step);
+#endif
+	if (drive->media != ide_disk)
+		return;
+
+	switch (pm->pm_step) {
+	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
+		if (pm->pm_state == PM_EVENT_FREEZE)
+			pm->pm_step = IDE_PM_COMPLETED;
+		else
+			pm->pm_step = IDE_PM_STANDBY;
+		break;
+	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
+		pm->pm_step = IDE_PM_COMPLETED;
+		break;
+	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
+		pm->pm_step = IDE_PM_IDLE;
+		break;
+	case IDE_PM_IDLE:		/* Resume step 2 (idle)*/
+		pm->pm_step = IDE_PM_RESTORE_DMA;
+		break;
+	}
+}
+
+ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
+{
+	struct request_pm_state *pm = rq->data;
+	ide_task_t *args = rq->special;
+
+	memset(args, 0, sizeof(*args));
+
+	switch (pm->pm_step) {
+	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
+		if (drive->media != ide_disk)
+			break;
+		/* Not supported? Switch to next step now. */
+		if (ata_id_flush_enabled(drive->id) == 0 ||
+		    (drive->dev_flags & IDE_DFLAG_WCACHE) == 0) {
+			ide_complete_power_step(drive, rq);
+			return ide_stopped;
+		}
+		if (ata_id_flush_ext_enabled(drive->id))
+			args->tf.command = ATA_CMD_FLUSH_EXT;
+		else
+			args->tf.command = ATA_CMD_FLUSH;
+		goto out_do_tf;
+	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
+		args->tf.command = ATA_CMD_STANDBYNOW1;
+		goto out_do_tf;
+	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
+		ide_set_max_pio(drive);
+		/*
+		 * skip IDE_PM_IDLE for ATAPI devices
+		 */
+		if (drive->media != ide_disk)
+			pm->pm_step = IDE_PM_RESTORE_DMA;
+		else
+			ide_complete_power_step(drive, rq);
+		return ide_stopped;
+	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
+		args->tf.command = ATA_CMD_IDLEIMMEDIATE;
+		goto out_do_tf;
+	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
+		/*
+		 * Right now, all we do is call ide_set_dma(drive),
+		 * we could be smarter and check for current xfer_speed
+		 * in struct drive etc...
+		 */
+		if (drive->hwif->dma_ops == NULL)
+			break;
+		/*
+		 * TODO: respect IDE_DFLAG_USING_DMA
+		 */
+		ide_set_dma(drive);
+		break;
+	}
+
+	pm->pm_step = IDE_PM_COMPLETED;
+	return ide_stopped;
+
+out_do_tf:
+	args->tf_flags	 = IDE_TFLAG_TF | IDE_TFLAG_DEVICE;
+	args->data_phase = TASKFILE_NO_DATA;
+	return do_rw_taskfile(drive, args);
+}
+
+/**
+ *	ide_complete_pm_request - end the current Power Management request
+ *	@drive: target drive
+ *	@rq: request
+ *
+ *	This function cleans up the current PM request and stops the queue
+ *	if necessary.
+ */
+void ide_complete_pm_request(ide_drive_t *drive, struct request *rq)
+{
+	struct request_queue *q = drive->queue;
+	unsigned long flags;
+
+#ifdef DEBUG_PM
+	printk("%s: completing PM request, %s\n", drive->name,
+	       blk_pm_suspend_request(rq) ? "suspend" : "resume");
+#endif
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_pm_suspend_request(rq)) {
+		blk_stop_queue(q);
+	} else {
+		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
+		blk_start_queue(q);
+	}
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	drive->hwif->hwgroup->rq = NULL;
+
+	if (blk_end_request(rq, 0, 0))
+		BUG();
+}
+
+void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
+{
+	struct request_pm_state *pm = rq->data;
+
+	if (blk_pm_suspend_request(rq) &&
+	    pm->pm_step == IDE_PM_START_SUSPEND)
+		/* Mark drive blocked when starting the suspend sequence. */
+		drive->dev_flags |= IDE_DFLAG_BLOCKED;
+	else if (blk_pm_resume_request(rq) &&
+		 pm->pm_step == IDE_PM_START_RESUME) {
+		/*
+		 * The first thing we do on wakeup is to wait for BSY bit to
+		 * go away (with a looong timeout) as a drive on this hwif may
+		 * just be POSTing itself.
+		 * We do that before even selecting as the "other" device on
+		 * the bus may be broken enough to walk on our toes at this
+		 * point.
+		 */
+		ide_hwif_t *hwif = drive->hwif;
+		int rc;
+#ifdef DEBUG_PM
+		printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
+#endif
+		rc = ide_wait_not_busy(hwif, 35000);
+		if (rc)
+			printk(KERN_WARNING "%s: bus not ready on wakeup\n", drive->name);
+		SELECT_DRIVE(drive);
+		hwif->tp_ops->set_irq(hwif, 1);
+		rc = ide_wait_not_busy(hwif, 100000);
+		if (rc)
+			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
+	}
+}
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index 41d04205354..f0f09f702e9 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -388,80 +388,6 @@ ide_ext_devset_rw_sync(unmaskirq, unmaskirq);
 ide_ext_devset_rw_sync(using_dma, using_dma);
 __IDE_DEVSET(pio_mode, DS_SYNC, NULL, set_pio_mode);
 
-static int generic_ide_suspend(struct device *dev, pm_message_t mesg)
-{
-	ide_drive_t *drive = dev->driver_data, *pair = ide_get_pair_dev(drive);
-	ide_hwif_t *hwif = HWIF(drive);
-	struct request *rq;
-	struct request_pm_state rqpm;
-	ide_task_t args;
-	int ret;
-
-	/* call ACPI _GTM only once */
-	if ((drive->dn & 1) == 0 || pair == NULL)
-		ide_acpi_get_timing(hwif);
-
-	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&args, 0, sizeof(args));
-	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
-	rq->special = &args;
-	rq->data = &rqpm;
-	rqpm.pm_step = IDE_PM_START_SUSPEND;
-	if (mesg.event == PM_EVENT_PRETHAW)
-		mesg.event = PM_EVENT_FREEZE;
-	rqpm.pm_state = mesg.event;
-
-	ret = blk_execute_rq(drive->queue, NULL, rq, 0);
-	blk_put_request(rq);
-
-	/* call ACPI _PS3 only after both devices are suspended */
-	if (ret == 0 && ((drive->dn & 1) || pair == NULL))
-		ide_acpi_set_state(hwif, 0);
-
-	return ret;
-}
-
-static int generic_ide_resume(struct device *dev)
-{
-	ide_drive_t *drive = dev->driver_data, *pair = ide_get_pair_dev(drive);
-	ide_hwif_t *hwif = HWIF(drive);
-	struct request *rq;
-	struct request_pm_state rqpm;
-	ide_task_t args;
-	int err;
-
-	/* call ACPI _PS0 / _STM only once */
-	if ((drive->dn & 1) == 0 || pair == NULL) {
-		ide_acpi_set_state(hwif, 1);
-		ide_acpi_push_timing(hwif);
-	}
-
-	ide_acpi_exec_tfs(drive);
-
-	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&args, 0, sizeof(args));
-	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_PM_RESUME;
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = &args;
-	rq->data = &rqpm;
-	rqpm.pm_step = IDE_PM_START_RESUME;
-	rqpm.pm_state = PM_EVENT_ON;
-
-	err = blk_execute_rq(drive->queue, NULL, rq, 1);
-	blk_put_request(rq);
-
-	if (err == 0 && dev->driver) {
-		ide_driver_t *drv = to_ide_driver(dev->driver);
-
-		if (drv->resume)
-			drv->resume(drive);
-	}
-
-	return err;
-}
-
 /**
  * ide_device_get	-	get an additional reference to a ide_drive_t
  * @drive:	device to get a reference to
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9b89cab6d49..e99c56de7f5 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1116,6 +1116,14 @@ enum {
 	IDE_PM_COMPLETED,
 };
 
+int generic_ide_suspend(struct device *, pm_message_t);
+int generic_ide_resume(struct device *);
+
+void ide_complete_power_step(ide_drive_t *, struct request *);
+ide_startstop_t ide_start_power_step(ide_drive_t *, struct request *);
+void ide_complete_pm_request(ide_drive_t *, struct request *);
+void ide_check_pm_state(ide_drive_t *, struct request *);
+
 /*
  * Subdrivers support.
  *
-- 
cgit v1.2.3-70-g09d2


From 69acdf1e5a9146ec6667f6c4b439acd38c18f5ea Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Tue, 4 Nov 2008 21:59:37 -0300
Subject: V4L/DVB (9530): Add new pixel format VYUY 16 bits wide.

There were already 3 YUV formats defined :
 - YUYV
 - YVYU
 - UYVY
The only left combination is VYUY, which is added in this
patch.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 4669d7e72e7..ec311d4616c 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -293,6 +293,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_YVU420  v4l2_fourcc('Y', 'V', '1', '2') /* 12  YVU 4:2:0     */
 #define V4L2_PIX_FMT_YUYV    v4l2_fourcc('Y', 'U', 'Y', 'V') /* 16  YUV 4:2:2     */
 #define V4L2_PIX_FMT_UYVY    v4l2_fourcc('U', 'Y', 'V', 'Y') /* 16  YUV 4:2:2     */
+#define V4L2_PIX_FMT_VYUY    v4l2_fourcc('V', 'Y', 'U', 'Y') /* 16  YUV 4:2:2     */
 #define V4L2_PIX_FMT_YUV422P v4l2_fourcc('4', '2', '2', 'P') /* 16  YVU422 planar */
 #define V4L2_PIX_FMT_YUV411P v4l2_fourcc('4', '1', '1', 'P') /* 16  YVU411 planar */
 #define V4L2_PIX_FMT_Y41P    v4l2_fourcc('Y', '4', '1', 'P') /* 12  YUV 4:1:1     */
-- 
cgit v1.2.3-70-g09d2


From 278d1ed65e25d80af7c3a112d707b3f70516ddb4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:12 +1030
Subject: cpumask: make CONFIG_NR_CPUS always valid.

Impact: cleanup

Currently we have NR_CPUS, which is 1 on UP, and CONFIG_NR_CPUS on
SMP.  If we make CONFIG_NR_CPUS always valid (and always 1 on !SMP),
we can skip the middleman.

This also allows us to find and check all the unaudited NR_CPUS usage
as we prepare for v. large NR_CPUS.

To avoid breaking every arch, we cheat and do this for the moment
in the header if the arch doesn't.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 include/linux/threads.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/threads.h b/include/linux/threads.h
index 38d1a5d6568..052b12bec8b 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -8,17 +8,17 @@
  */
 
 /*
- * Maximum supported processors that can run under SMP.  This value is
- * set via configure setting.  The maximum is equal to the size of the
- * bitmasks used on that platform, i.e. 32 or 64.  Setting this smaller
- * saves quite a bit of memory.
+ * Maximum supported processors.  Setting this smaller saves quite a
+ * bit of memory.  Use nr_cpu_ids instead of this except for static bitmaps.
  */
-#ifdef CONFIG_SMP
-#define NR_CPUS		CONFIG_NR_CPUS
-#else
-#define NR_CPUS		1
+#ifndef CONFIG_NR_CPUS
+/* FIXME: This should be fixed in the arch's Kconfig */
+#define CONFIG_NR_CPUS	1
 #endif
 
+/* Places which use this should consider cpumask_var_t. */
+#define NR_CPUS		CONFIG_NR_CPUS
+
 #define MIN_THREADS_LEFT_FOR_ROOT 4
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 4b0bc0bca83f3fb7cf920e2ec80684c15d2269c0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:13 +1030
Subject: bitmap: test for constant as well as small size for inline versions

Impact: reduce text size

bitmap_zero et al have a fastpath for nbits <= BITS_PER_LONG, but this
should really only apply where the nbits is known at compile time.

This only saves about 1200 bytes on an allyesconfig kernel, but with
cpumasks going variable that number will increase.

   text		data	bss	dec		hex	filename
35327852        5035607 6782976 47146435        2cf65c3 vmlinux-before
35326640        5035607 6782976 47145223        2cf6107 vmlinux-after

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/bitmap.h | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index a08c33a26ca..2878811c613 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -137,9 +137,12 @@ extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
 		(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL		\
 )
 
+#define small_const_nbits(nbits) \
+	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
+
 static inline void bitmap_zero(unsigned long *dst, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = 0UL;
 	else {
 		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
@@ -150,7 +153,7 @@ static inline void bitmap_zero(unsigned long *dst, int nbits)
 static inline void bitmap_fill(unsigned long *dst, int nbits)
 {
 	size_t nlongs = BITS_TO_LONGS(nbits);
-	if (nlongs > 1) {
+	if (!small_const_nbits(nbits)) {
 		int len = (nlongs - 1) * sizeof(unsigned long);
 		memset(dst, 0xff,  len);
 	}
@@ -160,7 +163,7 @@ static inline void bitmap_fill(unsigned long *dst, int nbits)
 static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 			int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = *src;
 	else {
 		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
@@ -171,7 +174,7 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 static inline void bitmap_and(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = *src1 & *src2;
 	else
 		__bitmap_and(dst, src1, src2, nbits);
@@ -180,7 +183,7 @@ static inline void bitmap_and(unsigned long *dst, const unsigned long *src1,
 static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = *src1 | *src2;
 	else
 		__bitmap_or(dst, src1, src2, nbits);
@@ -189,7 +192,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
 static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = *src1 ^ *src2;
 	else
 		__bitmap_xor(dst, src1, src2, nbits);
@@ -198,7 +201,7 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
 static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = *src1 & ~(*src2);
 	else
 		__bitmap_andnot(dst, src1, src2, nbits);
@@ -207,7 +210,7 @@ static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1,
 static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
 			int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);
 	else
 		__bitmap_complement(dst, src, nbits);
@@ -216,7 +219,7 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr
 static inline int bitmap_equal(const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
 	else
 		return __bitmap_equal(src1, src2, nbits);
@@ -225,7 +228,7 @@ static inline int bitmap_equal(const unsigned long *src1,
 static inline int bitmap_intersects(const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
 	else
 		return __bitmap_intersects(src1, src2, nbits);
@@ -234,7 +237,7 @@ static inline int bitmap_intersects(const unsigned long *src1,
 static inline int bitmap_subset(const unsigned long *src1,
 			const unsigned long *src2, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
 	else
 		return __bitmap_subset(src1, src2, nbits);
@@ -242,7 +245,7 @@ static inline int bitmap_subset(const unsigned long *src1,
 
 static inline int bitmap_empty(const unsigned long *src, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
 	else
 		return __bitmap_empty(src, nbits);
@@ -250,7 +253,7 @@ static inline int bitmap_empty(const unsigned long *src, int nbits)
 
 static inline int bitmap_full(const unsigned long *src, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
 	else
 		return __bitmap_full(src, nbits);
@@ -258,7 +261,7 @@ static inline int bitmap_full(const unsigned long *src, int nbits)
 
 static inline int bitmap_weight(const unsigned long *src, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
 	return __bitmap_weight(src, nbits);
 }
@@ -266,7 +269,7 @@ static inline int bitmap_weight(const unsigned long *src, int nbits)
 static inline void bitmap_shift_right(unsigned long *dst,
 			const unsigned long *src, int n, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = *src >> n;
 	else
 		__bitmap_shift_right(dst, src, n, nbits);
@@ -275,7 +278,7 @@ static inline void bitmap_shift_right(unsigned long *dst,
 static inline void bitmap_shift_left(unsigned long *dst,
 			const unsigned long *src, int n, int nbits)
 {
-	if (nbits <= BITS_PER_LONG)
+	if (small_const_nbits(nbits))
 		*dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits);
 	else
 		__bitmap_shift_left(dst, src, n, nbits);
-- 
cgit v1.2.3-70-g09d2


From cb78a0ce69fad2026825f957e24e2d9cda1ec9f1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:14 +1030
Subject: bitmap: fix seq_bitmap and seq_cpumask to take const pointer

Impact: cleanup

seq_bitmap just calls bitmap_scnprintf on the bits: that arg can be const.
Similarly, seq_cpumask just calls seq_bitmap.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/seq_file.c            | 3 ++-
 include/linux/seq_file.h | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c2..c99358a5217 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -462,7 +462,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 	return -1;
 }
 
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+				   unsigned int nr_bits)
 {
 	if (m->count < m->size) {
 		int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index b3dfa72f13b..952e0187ba1 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -50,8 +50,9 @@ int seq_path(struct seq_file *, struct path *, char *);
 int seq_dentry(struct seq_file *, struct dentry *, char *);
 int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
 		  char *esc);
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits);
-static inline int seq_cpumask(struct seq_file *m, cpumask_t *mask)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+				   unsigned int nr_bits);
+static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask)
 {
 	return seq_bitmap(m, mask->bits, NR_CPUS);
 }
-- 
cgit v1.2.3-70-g09d2


From b3199c025d1646e25e7d1d640dd605db251dccf8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:14 +1030
Subject: cpumask: switch over to cpu_online/possible/active/present_mask: core

Impact: cleanup

This implements the obsolescent cpu_online_map in terms of
cpu_online_mask, rather than the other way around.  Same for the other
maps.

The documentation comments are also updated to refer to _mask rather
than _map.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 include/linux/cpumask.h | 75 +++++++++++++++++++------------------------------
 kernel/cpu.c            | 49 +++++++++++++++-----------------
 2 files changed, 52 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b5ad19a6f43..db2341beca4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -416,65 +416,54 @@ int __next_cpu_nr(int n, const cpumask_t *srcp);
 
 /*
  * The following particular system cpumasks and operations manage
- * possible, present, active and online cpus.  Each of them is a fixed size
- * bitmap of size NR_CPUS.
+ * possible, present, active and online cpus.
  *
- *  #ifdef CONFIG_HOTPLUG_CPU
- *     cpu_possible_map - has bit 'cpu' set iff cpu is populatable
- *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
- *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
- *     cpu_active_map   - has bit 'cpu' set iff cpu available to migration
- *  #else
- *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
- *     cpu_present_map  - copy of cpu_possible_map
- *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
- *  #endif
+ *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
+ *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
+ *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
+ *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
  *
- *  In either case, NR_CPUS is fixed at compile time, as the static
- *  size of these bitmaps.  The cpu_possible_map is fixed at boot
- *  time, as the set of CPU id's that it is possible might ever
- *  be plugged in at anytime during the life of that system boot.
- *  The cpu_present_map is dynamic(*), representing which CPUs
- *  are currently plugged in.  And cpu_online_map is the dynamic
- *  subset of cpu_present_map, indicating those CPUs available
- *  for scheduling.
+ *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
  *
- *  If HOTPLUG is enabled, then cpu_possible_map is forced to have
+ *  The cpu_possible_mask is fixed at boot time, as the set of CPU id's
+ *  that it is possible might ever be plugged in at anytime during the
+ *  life of that system boot.  The cpu_present_mask is dynamic(*),
+ *  representing which CPUs are currently plugged in.  And
+ *  cpu_online_mask is the dynamic subset of cpu_present_mask,
+ *  indicating those CPUs available for scheduling.
+ *
+ *  If HOTPLUG is enabled, then cpu_possible_mask is forced to have
  *  all NR_CPUS bits set, otherwise it is just the set of CPUs that
  *  ACPI reports present at boot.
  *
- *  If HOTPLUG is enabled, then cpu_present_map varies dynamically,
+ *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
  *  depending on what ACPI reports as currently plugged in, otherwise
- *  cpu_present_map is just a copy of cpu_possible_map.
+ *  cpu_present_mask is just a copy of cpu_possible_mask.
  *
- *  (*) Well, cpu_present_map is dynamic in the hotplug case.  If not
- *      hotplug, it's a copy of cpu_possible_map, hence fixed at boot.
+ *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
+ *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
  *
  * Subtleties:
  * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
  *    assumption that their single CPU is online.  The UP
- *    cpu_{online,possible,present}_maps are placebos.  Changing them
+ *    cpu_{online,possible,present}_masks are placebos.  Changing them
  *    will have no useful affect on the following num_*_cpus()
  *    and cpu_*() macros in the UP case.  This ugliness is a UP
  *    optimization - don't waste any instructions or memory references
  *    asking if you're online or how many CPUs there are if there is
  *    only one CPU.
- * 2) Most SMP arch's #define some of these maps to be some
- *    other map specific to that arch.  Therefore, the following
- *    must be #define macros, not inlines.  To see why, examine
- *    the assembly code produced by the following.  Note that
- *    set1() writes phys_x_map, but set2() writes x_map:
- *        int x_map, phys_x_map;
- *        #define set1(a) x_map = a
- *        inline void set2(int a) { x_map = a; }
- *        #define x_map phys_x_map
- *        main(){ set1(3); set2(5); }
  */
 
-extern cpumask_t cpu_possible_map;
-extern cpumask_t cpu_online_map;
-extern cpumask_t cpu_present_map;
-extern cpumask_t cpu_active_map;
+extern const struct cpumask *const cpu_possible_mask;
+extern const struct cpumask *const cpu_online_mask;
+extern const struct cpumask *const cpu_present_mask;
+extern const struct cpumask *const cpu_active_mask;
+
+/* These strip const, as traditionally they weren't const. */
+#define cpu_possible_map	(*(cpumask_t *)cpu_possible_mask)
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+#define cpu_present_map		(*(cpumask_t *)cpu_present_mask)
+#define cpu_active_map		(*(cpumask_t *)cpu_active_mask)
 
 #if NR_CPUS > 1
 #define num_online_cpus()	cpus_weight_nr(cpu_online_map)
@@ -1058,12 +1047,6 @@ static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
 }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
-/* The pointer versions of the maps, these will become the primary versions. */
-#define cpu_possible_mask ((const struct cpumask *)&cpu_possible_map)
-#define cpu_online_mask ((const struct cpumask *)&cpu_online_map)
-#define cpu_present_mask ((const struct cpumask *)&cpu_present_map)
-#define cpu_active_mask ((const struct cpumask *)&cpu_active_map)
-
 /* It's common to want to use cpu_all_mask in struct member initializers,
  * so it has to refer to an address rather than a pointer. */
 extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bae131a1211..3ddc509b19c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,30 +15,8 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-/*
- * Represents all cpu's that are currently online.
- */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
-#ifdef CONFIG_INIT_ALL_POSSIBLE
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-#else
-cpumask_t cpu_possible_map __read_mostly;
-#endif
-EXPORT_SYMBOL(cpu_possible_map);
-
 #ifdef CONFIG_SMP
-/* Serializes the updates to cpu_online_map, cpu_present_map */
+/* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
 static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -65,8 +43,6 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
-cpumask_t cpu_active_map;
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -97,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
 
 /*
  * The following two API's must be used when attempting
- * to serialize the updates to cpu_online_map, cpu_present_map.
+ * to serialize the updates to cpu_online_mask, cpu_present_mask.
  */
 void cpu_maps_update_begin(void)
 {
@@ -503,3 +479,24 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
 
 const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
 EXPORT_SYMBOL(cpu_all_bits);
+
+#ifdef CONFIG_INIT_ALL_POSSIBLE
+static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
+	= CPU_BITS_ALL;
+#else
+static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
+#endif
+const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
+EXPORT_SYMBOL(cpu_possible_mask);
+
+static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
+EXPORT_SYMBOL(cpu_online_mask);
+
+static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
+EXPORT_SYMBOL(cpu_present_mask);
+
+static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
+EXPORT_SYMBOL(cpu_active_mask);
-- 
cgit v1.2.3-70-g09d2


From ae7a47e72e1a0b5e2b46d1596bc2c22942a73023 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:15 +1030
Subject: cpumask: make cpumask.h eat its own dogfood.

Changes:
1) cpumask_t to struct cpumask,
2) cpus_weight_nr to cpumask_weight,
3) cpu_isset to cpumask_test_cpu,
4) ->bits to cpumask_bits()
5) cpu_*_map to cpu_*_mask.
6) for_each_cpu_mask_nr to for_each_cpu

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/cpumask.h | 75 +++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index db2341beca4..e62a67156c5 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -268,6 +268,25 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
 }
 
+/**
+ * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
+ * @bitmap: the bitmap
+ *
+ * There are a few places where cpumask_var_t isn't appropriate and
+ * static cpumasks must be used (eg. very early boot), yet we don't
+ * expose the definition of 'struct cpumask'.
+ *
+ * This does the conversion, and can be used as a constant initializer.
+ */
+#define to_cpumask(bitmap)						\
+	((struct cpumask *)(1 ? (bitmap)				\
+			    : (void *)sizeof(__check_is_bitmap(bitmap))))
+
+static inline int __check_is_bitmap(const unsigned long *bitmap)
+{
+	return 1;
+}
+
 /*
  * Special-case data structure for "single bit set only" constant CPU masks.
  *
@@ -278,11 +297,11 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 extern const unsigned long
 	cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];
 
-static inline const cpumask_t *get_cpu_mask(unsigned int cpu)
+static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 {
 	const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
 	p -= cpu / BITS_PER_LONG;
-	return (const cpumask_t *)p;
+	return to_cpumask(p);
 }
 
 /*
@@ -466,13 +485,13 @@ extern const struct cpumask *const cpu_active_mask;
 #define cpu_active_map		(*(cpumask_t *)cpu_active_mask)
 
 #if NR_CPUS > 1
-#define num_online_cpus()	cpus_weight_nr(cpu_online_map)
-#define num_possible_cpus()	cpus_weight_nr(cpu_possible_map)
-#define num_present_cpus()	cpus_weight_nr(cpu_present_map)
-#define cpu_online(cpu)		cpu_isset((cpu), cpu_online_map)
-#define cpu_possible(cpu)	cpu_isset((cpu), cpu_possible_map)
-#define cpu_present(cpu)	cpu_isset((cpu), cpu_present_map)
-#define cpu_active(cpu)		cpu_isset((cpu), cpu_active_map)
+#define num_online_cpus()	cpumask_weight(cpu_online_mask)
+#define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
+#define num_present_cpus()	cpumask_weight(cpu_present_mask)
+#define cpu_online(cpu)		cpumask_test_cpu((cpu), cpu_online_mask)
+#define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
+#define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
+#define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
 #else
 #define num_online_cpus()	1
 #define num_possible_cpus()	1
@@ -485,10 +504,6 @@ extern const struct cpumask *const cpu_active_mask;
 
 #define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))
 
-#define for_each_possible_cpu(cpu) for_each_cpu_mask_nr((cpu), cpu_possible_map)
-#define for_each_online_cpu(cpu)   for_each_cpu_mask_nr((cpu), cpu_online_map)
-#define for_each_present_cpu(cpu)  for_each_cpu_mask_nr((cpu), cpu_present_map)
-
 /* These are the new versions of the cpumask operators: passed by pointer.
  * The older versions will be implemented in terms of these, then deleted. */
 #define cpumask_bits(maskp) ((maskp)->bits)
@@ -676,7 +691,7 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
  * No static inline type checking - see Subtlety (1) above.
  */
 #define cpumask_test_cpu(cpu, cpumask) \
-	test_bit(cpumask_check(cpu), (cpumask)->bits)
+	test_bit(cpumask_check(cpu), cpumask_bits((cpumask)))
 
 /**
  * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
@@ -919,7 +934,7 @@ static inline void cpumask_copy(struct cpumask *dstp,
 static inline int cpumask_scnprintf(char *buf, int len,
 				    const struct cpumask *srcp)
 {
-	return bitmap_scnprintf(buf, len, srcp->bits, nr_cpumask_bits);
+	return bitmap_scnprintf(buf, len, cpumask_bits(srcp), nr_cpumask_bits);
 }
 
 /**
@@ -933,7 +948,7 @@ static inline int cpumask_scnprintf(char *buf, int len,
 static inline int cpumask_parse_user(const char __user *buf, int len,
 				     struct cpumask *dstp)
 {
-	return bitmap_parse_user(buf, len, dstp->bits, nr_cpumask_bits);
+	return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
 }
 
 /**
@@ -948,7 +963,8 @@ static inline int cpumask_parse_user(const char __user *buf, int len,
 static inline int cpulist_scnprintf(char *buf, int len,
 				    const struct cpumask *srcp)
 {
-	return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpumask_bits);
+	return bitmap_scnlistprintf(buf, len, cpumask_bits(srcp),
+				    nr_cpumask_bits);
 }
 
 /**
@@ -961,26 +977,7 @@ static inline int cpulist_scnprintf(char *buf, int len,
  */
 static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
 {
-	return bitmap_parselist(buf, dstp->bits, nr_cpumask_bits);
-}
-
-/**
- * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
- * @bitmap: the bitmap
- *
- * There are a few places where cpumask_var_t isn't appropriate and
- * static cpumasks must be used (eg. very early boot), yet we don't
- * expose the definition of 'struct cpumask'.
- *
- * This does the conversion, and can be used as a constant initializer.
- */
-#define to_cpumask(bitmap)						\
-	((struct cpumask *)(1 ? (bitmap)				\
-			    : (void *)sizeof(__check_is_bitmap(bitmap))))
-
-static inline int __check_is_bitmap(const unsigned long *bitmap)
-{
-	return 1;
+	return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits);
 }
 
 /**
@@ -1055,6 +1052,10 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 /* First bits of cpu_bit_bitmap are in fact unset. */
 #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
 
+#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
+#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
+#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+
 /* Wrappers for arch boot code to manipulate normally-constant masks */
 static inline void set_cpu_possible(unsigned int cpu, bool possible)
 {
-- 
cgit v1.2.3-70-g09d2


From 3fa41520696fec2815e2d88fbcccdda77ba4d693 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:16 +1030
Subject: cpumask: make set_cpu_*/init_cpu_* out-of-line

They're only for use in boot/cpu hotplug code anyway, and this avoids
the use of deprecated cpu_*_map.

Stephen Rothwell points out that gcc 4.2.4 (on powerpc at least)
didn't like the cast away of const anyway:

  include/linux/cpumask.h: In function 'set_cpu_possible':
  include/linux/cpumask.h:1052: warning: passing argument 2 of 'cpumask_set_cpu' discards qualifiers from pointer target type

So this kills two birds with one stone.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/cpumask.h | 53 +++++++------------------------------------------
 kernel/cpu.c            | 47 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index e62a67156c5..7c178a6baae 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1057,50 +1057,11 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
 
 /* Wrappers for arch boot code to manipulate normally-constant masks */
-static inline void set_cpu_possible(unsigned int cpu, bool possible)
-{
-	if (possible)
-		cpumask_set_cpu(cpu, &cpu_possible_map);
-	else
-		cpumask_clear_cpu(cpu, &cpu_possible_map);
-}
-
-static inline void set_cpu_present(unsigned int cpu, bool present)
-{
-	if (present)
-		cpumask_set_cpu(cpu, &cpu_present_map);
-	else
-		cpumask_clear_cpu(cpu, &cpu_present_map);
-}
-
-static inline void set_cpu_online(unsigned int cpu, bool online)
-{
-	if (online)
-		cpumask_set_cpu(cpu, &cpu_online_map);
-	else
-		cpumask_clear_cpu(cpu, &cpu_online_map);
-}
-
-static inline void set_cpu_active(unsigned int cpu, bool active)
-{
-	if (active)
-		cpumask_set_cpu(cpu, &cpu_active_map);
-	else
-		cpumask_clear_cpu(cpu, &cpu_active_map);
-}
-
-static inline void init_cpu_present(const struct cpumask *src)
-{
-	cpumask_copy(&cpu_present_map, src);
-}
-
-static inline void init_cpu_possible(const struct cpumask *src)
-{
-	cpumask_copy(&cpu_possible_map, src);
-}
-
-static inline void init_cpu_online(const struct cpumask *src)
-{
-	cpumask_copy(&cpu_online_map, src);
-}
+void set_cpu_possible(unsigned int cpu, bool possible);
+void set_cpu_present(unsigned int cpu, bool present);
+void set_cpu_online(unsigned int cpu, bool online);
+void set_cpu_active(unsigned int cpu, bool active);
+void init_cpu_present(const struct cpumask *src);
+void init_cpu_possible(const struct cpumask *src);
+void init_cpu_online(const struct cpumask *src);
 #endif /* __LINUX_CPUMASK_H */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3ddc509b19c..2c9f78f3a2f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -500,3 +500,50 @@ EXPORT_SYMBOL(cpu_present_mask);
 static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
 const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
 EXPORT_SYMBOL(cpu_active_mask);
+
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+	if (possible)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
+}
+
+void set_cpu_present(unsigned int cpu, bool present)
+{
+	if (present)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+}
+
+void set_cpu_online(unsigned int cpu, bool online)
+{
+	if (online)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+}
+
+void set_cpu_active(unsigned int cpu, bool active)
+{
+	if (active)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+}
+
+void init_cpu_present(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_present_bits), src);
+}
+
+void init_cpu_possible(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_possible_bits), src);
+}
+
+void init_cpu_online(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_online_bits), src);
+}
-- 
cgit v1.2.3-70-g09d2


From 54b11e6d57a10aa9d0009efd93873e17bffd5d30 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:16 +1030
Subject: cpumask: smp_call_function_many()

Impact: Implementation change to remove cpumask_t from stack.

Actually change smp_call_function_mask() to smp_call_function_many().
We avoid cpumasks on the stack in this version.

(S390 has its own version, but that's going away apparently).

We have to do some dancing to figure out if 0 or 1 other cpus are in
the mask supplied and the online mask without allocating a tmp
cpumask.  It's still fairly cheap.

We allocate the cpumask at the end of the call_function_data
structure: if allocation fails we fallback to smp_call_function_single
rather than using the baroque quiescing code (which needs a cpumask on
stack).

(Thanks to Hiroshi Shimamoto for spotting several bugs in previous versions!)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: npiggin@suse.de
Cc: axboe@kernel.dk
---
 include/linux/smp.h |  15 +++---
 kernel/smp.c        | 139 ++++++++++++++++++----------------------------------
 2 files changed, 57 insertions(+), 97 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 2f85f3b04bc..b8246696810 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -67,15 +67,16 @@ extern void smp_cpus_done(unsigned int max_cpus);
  * Call a function on all other processors
  */
 int smp_call_function(void(*func)(void *info), void *info, int wait);
-/* Deprecated: use smp_call_function_many() which uses a cpumask ptr. */
-int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info,
-				int wait);
+void smp_call_function_many(const struct cpumask *mask,
+			    void (*func)(void *info), void *info, bool wait);
 
-static inline void smp_call_function_many(const struct cpumask *mask,
-					  void (*func)(void *info), void *info,
-					  int wait)
+/* Deprecated: Use smp_call_function_many which takes a pointer to the mask. */
+static inline int
+smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info,
+		       int wait)
 {
-	smp_call_function_mask(*mask, func, info, wait);
+	smp_call_function_many(&mask, func, info, wait);
+	return 0;
 }
 
 int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c5..9f0eafed139 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,8 +24,8 @@ struct call_function_data {
 	struct call_single_data csd;
 	spinlock_t lock;
 	unsigned int refs;
-	cpumask_t cpumask;
 	struct rcu_head rcu_head;
+	unsigned long cpumask_bits[];
 };
 
 struct call_single_queue {
@@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void)
 	list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
 		int refs;
 
-		if (!cpu_isset(cpu, data->cpumask))
+		if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
 			continue;
 
 		data->csd.func(data->csd.info);
 
 		spin_lock(&data->lock);
-		cpu_clear(cpu, data->cpumask);
+		cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
 		WARN_ON(data->refs == 0);
 		data->refs--;
 		refs = data->refs;
@@ -266,51 +266,13 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
 	generic_exec_single(cpu, data);
 }
 
-/* Dummy function */
-static void quiesce_dummy(void *unused)
-{
-}
-
-/*
- * Ensure stack based data used in call function mask is safe to free.
- *
- * This is needed by smp_call_function_mask when using on-stack data, because
- * a single call function queue is shared by all CPUs, and any CPU may pick up
- * the data item on the queue at any time before it is deleted. So we need to
- * ensure that all CPUs have transitioned through a quiescent state after
- * this call.
- *
- * This is a very slow function, implemented by sending synchronous IPIs to
- * all possible CPUs. For this reason, we have to alloc data rather than use
- * stack based data even in the case of synchronous calls. The stack based
- * data is then just used for deadlock/oom fallback which will be very rare.
- *
- * If a faster scheme can be made, we could go back to preferring stack based
- * data -- the data allocation/free is non-zero cost.
- */
-static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
-{
-	struct call_single_data data;
-	int cpu;
-
-	data.func = quiesce_dummy;
-	data.info = NULL;
-
-	for_each_cpu_mask(cpu, mask) {
-		data.flags = CSD_FLAG_WAIT;
-		generic_exec_single(cpu, &data);
-	}
-}
-
 /**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on.
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
  * @wait: If true, wait (atomically) until function has completed on other CPUs.
  *
- * Returns 0 on success, else a negative status code.
- *
  * If @wait is true, then returns once @func has returned. Note that @wait
  * will be implicitly turned on in case of allocation failures, since
  * we fall back to on-stack allocation.
@@ -319,53 +281,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
  * hardware interrupt handler or from a bottom half handler. Preemption
  * must be disabled when calling this function.
  */
-int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
-			   int wait)
+void smp_call_function_many(const struct cpumask *mask,
+			    void (*func)(void *), void *info,
+			    bool wait)
 {
-	struct call_function_data d;
-	struct call_function_data *data = NULL;
-	cpumask_t allbutself;
+	struct call_function_data *data;
 	unsigned long flags;
-	int cpu, num_cpus;
-	int slowpath = 0;
+	int cpu, next_cpu;
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
-	cpu = smp_processor_id();
-	allbutself = cpu_online_map;
-	cpu_clear(cpu, allbutself);
-	cpus_and(mask, mask, allbutself);
-	num_cpus = cpus_weight(mask);
-
-	/*
-	 * If zero CPUs, return. If just a single CPU, turn this request
-	 * into a targetted single call instead since it's faster.
-	 */
-	if (!num_cpus)
-		return 0;
-	else if (num_cpus == 1) {
-		cpu = first_cpu(mask);
-		return smp_call_function_single(cpu, func, info, wait);
+	/* So, what's a CPU they want?  Ignoring this one. */
+	cpu = cpumask_first_and(mask, cpu_online_mask);
+	if (cpu == smp_processor_id())
+		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+	/* No online cpus?  We're done. */
+	if (cpu >= nr_cpu_ids)
+		return;
+
+	/* Do we have another CPU which isn't us? */
+	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+	if (next_cpu == smp_processor_id())
+		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
+
+	/* Fastpath: do that cpu by itself. */
+	if (next_cpu >= nr_cpu_ids) {
+		smp_call_function_single(cpu, func, info, wait);
+		return;
 	}
 
-	data = kmalloc(sizeof(*data), GFP_ATOMIC);
-	if (data) {
-		data->csd.flags = CSD_FLAG_ALLOC;
-		if (wait)
-			data->csd.flags |= CSD_FLAG_WAIT;
-	} else {
-		data = &d;
-		data->csd.flags = CSD_FLAG_WAIT;
-		wait = 1;
-		slowpath = 1;
+	data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
+	if (unlikely(!data)) {
+		/* Slow path. */
+		for_each_online_cpu(cpu) {
+			if (cpu == smp_processor_id())
+				continue;
+			if (cpumask_test_cpu(cpu, mask))
+				smp_call_function_single(cpu, func, info, wait);
+		}
+		return;
 	}
 
 	spin_lock_init(&data->lock);
+	data->csd.flags = CSD_FLAG_ALLOC;
+	if (wait)
+		data->csd.flags |= CSD_FLAG_WAIT;
 	data->csd.func = func;
 	data->csd.info = info;
-	data->refs = num_cpus;
-	data->cpumask = mask;
+	cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
+	data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
 
 	spin_lock_irqsave(&call_function_lock, flags);
 	list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +343,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi(mask);
+	arch_send_call_function_ipi(*to_cpumask(data->cpumask_bits));
 
 	/* optionally wait for the CPUs to complete */
-	if (wait) {
+	if (wait)
 		csd_flag_wait(&data->csd);
-		if (unlikely(slowpath))
-			smp_call_function_mask_quiesce_stack(mask);
-	}
-
-	return 0;
 }
-EXPORT_SYMBOL(smp_call_function_mask);
+EXPORT_SYMBOL(smp_call_function_many);
 
 /**
  * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +357,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
  * @info: An arbitrary pointer to pass to the function.
  * @wait: If true, wait (atomically) until function has completed on other CPUs.
  *
- * Returns 0 on success, else a negative status code.
+ * Returns 0.
  *
  * If @wait is true, then returns once @func has returned; otherwise
  * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +368,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
  */
 int smp_call_function(void (*func)(void *), void *info, int wait)
 {
-	int ret;
-
 	preempt_disable();
-	ret = smp_call_function_mask(cpu_online_map, func, info, wait);
+	smp_call_function_many(cpu_online_mask, func, info, wait);
 	preempt_enable();
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
 
-- 
cgit v1.2.3-70-g09d2


From e12f0102ac81d660c9f801d0a0e10ccf4537a9de Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:19 +1030
Subject: cpumask: Use nr_cpu_ids in seq_cpumask

Impact: cleanup, futureproof

nr_cpu_ids is the (badly named) runtime limit on possible CPU numbers;
ie. the variable version of NR_CPUS.

With the new cpumask operators, only bits less than this are defined.
So we should use it everywhere, rather than NR_CPUS.  Eventually this
will make it possible to allocate cpumasks of the minimal length at runtime.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/seq_file.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 952e0187ba1..40ea5058c2e 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -54,7 +54,7 @@ int seq_bitmap(struct seq_file *m, const unsigned long *bits,
 				   unsigned int nr_bits);
 static inline int seq_cpumask(struct seq_file *m, const struct cpumask *mask)
 {
-	return seq_bitmap(m, mask->bits, NR_CPUS);
+	return seq_bitmap(m, mask->bits, nr_cpu_ids);
 }
 
 static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
-- 
cgit v1.2.3-70-g09d2


From 480daab42c4dd74b3c07031ddf9031251c530c77 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:25:56 -0600
Subject: virtio: Don't use PAGE_SIZE in virtio_pci.c

The virtio PCI devices don't depend on the guest page size.  This matters
now PowerPC virtio is gaining ground (they like 64k pages).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_pci.c | 2 +-
 include/linux/virtio_pci.h  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index e37d686edfe..c2cd69ba0a8 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -245,7 +245,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
 	}
 
 	/* activate the queue */
-	iowrite32(virt_to_phys(info->queue) >> PAGE_SHIFT,
+	iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT,
 		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	/* create the vring */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index cdef3574293..e13d7ebcf57 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -53,4 +53,8 @@
 
 /* Virtio ABI version, this must match exactly */
 #define VIRTIO_PCI_ABI_VERSION		0
+
+/* How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size. */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
 #endif
-- 
cgit v1.2.3-70-g09d2


From 5f0d1d7f2286c8a02dab69f5f0bd51681fab161e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:25:57 -0600
Subject: virtio: rename 'pagesize' arg to vring_init/vring_size

It's really the alignment desired for consumer/producer separation;
historically this x86 pagesize, but with PowerPC it'll still be x86
pagesize.  And in theory lguest could choose a different value.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/virtio_ring.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index c4a598fb382..01bf3124e31 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -83,7 +83,7 @@ struct vring {
  *	__u16 avail_idx;
  *	__u16 available[num];
  *
- *	// Padding to the next page boundary.
+ *	// Padding to the next align boundary.
  *	char pad[];
  *
  *	// A ring of used descriptor heads with free-running index.
@@ -93,19 +93,19 @@ struct vring {
  * };
  */
 static inline void vring_init(struct vring *vr, unsigned int num, void *p,
-			      unsigned long pagesize)
+			      unsigned long align)
 {
 	vr->num = num;
 	vr->desc = p;
 	vr->avail = p + num*sizeof(struct vring_desc);
-	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + pagesize-1)
-			    & ~(pagesize - 1));
+	vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + align-1)
+			    & ~(align - 1));
 }
 
-static inline unsigned vring_size(unsigned int num, unsigned long pagesize)
+static inline unsigned vring_size(unsigned int num, unsigned long align)
 {
 	return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
-		 + pagesize - 1) & ~(pagesize - 1))
+		 + align - 1) & ~(align - 1))
 		+ sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 498af14783935af487d17dbee4ac451783cbc2b7 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:25:57 -0600
Subject: virtio: Don't use PAGE_SIZE for vring alignment in virtio_pci.

That doesn't work for non-4k guests which are now appearing.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_pci.c | 4 ++--
 include/linux/virtio_pci.h  | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index c2cd69ba0a8..f28643f3a4e 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -237,7 +237,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
 	info->queue_index = index;
 	info->num = num;
 
-	size = PAGE_ALIGN(vring_size(num, PAGE_SIZE));
+	size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
 	info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
 	if (info->queue == NULL) {
 		err = -ENOMEM;
@@ -290,7 +290,7 @@ static void vp_del_vq(struct virtqueue *vq)
 	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
 	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
-	size = PAGE_ALIGN(vring_size(info->num, PAGE_SIZE));
+	size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN));
 	free_pages_exact(info->queue, size);
 	kfree(info);
 }
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index e13d7ebcf57..cd0fd5d181a 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -57,4 +57,8 @@
 /* How many bits to shift physical queue address written to QUEUE_PFN.
  * 12 is historical, and due to x86 page size. */
 #define VIRTIO_PCI_QUEUE_ADDR_SHIFT	12
+
+/* The alignment to use between consumer and producer parts of vring.
+ * x86 pagesize again. */
+#define VIRTIO_PCI_VRING_ALIGN		4096
 #endif
-- 
cgit v1.2.3-70-g09d2


From 2966af73e70dee461c256b5eb877b2ff757f8c82 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:25:58 -0600
Subject: virtio: use LGUEST_VRING_ALIGN instead of relying on pagesize

This doesn't really matter, since lguest is i386 only at the moment,
but we could actually choose a different value.  (lguest doesn't have
a guarenteed ABI).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c   | 6 +++---
 drivers/lguest/lguest_device.c  | 2 +-
 include/linux/lguest_launcher.h | 4 ++++
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 804520633fc..aa2574ca94c 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1030,7 +1030,7 @@ static void update_device_status(struct device *dev)
 		/* Zero out the virtqueues. */
 		for (vq = dev->vq; vq; vq = vq->next) {
 			memset(vq->vring.desc, 0,
-			       vring_size(vq->config.num, getpagesize()));
+			       vring_size(vq->config.num, LGUEST_VRING_ALIGN));
 			lg_last_avail(vq) = 0;
 		}
 	} else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
@@ -1211,7 +1211,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	void *p;
 
 	/* First we need some memory for this virtqueue. */
-	pages = (vring_size(num_descs, getpagesize()) + getpagesize() - 1)
+	pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
 		/ getpagesize();
 	p = get_pages(pages);
 
@@ -1228,7 +1228,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	vq->config.pfn = to_guest_phys(p) / getpagesize();
 
 	/* Initialize the vring. */
-	vring_init(&vq->vring, num_descs, p, getpagesize());
+	vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
 
 	/* Append virtqueue to this device's descriptor.  We use
 	 * device_config() to get the end of the device's current virtqueues;
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index a661bbdae3d..f062dc55c57 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -250,7 +250,7 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 	/* Figure out how many pages the ring will take, and map that memory */
 	lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
 				DIV_ROUND_UP(vring_size(lvq->config.num,
-							PAGE_SIZE),
+							LGUEST_VRING_ALIGN),
 					     PAGE_SIZE));
 	if (!lvq->pages) {
 		err = -ENOMEM;
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index e7217dc58f3..bd0eba76052 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -59,4 +59,8 @@ enum lguest_req
 	LHREQ_IRQ, /* + irq */
 	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
 };
+
+/* The alignment to use between consumer and producer parts of vring.
+ * x86 pagesize for historical reasons. */
+#define LGUEST_VRING_ALIGN	4096
 #endif /* _LINUX_LGUEST_LAUNCHER */
-- 
cgit v1.2.3-70-g09d2


From 87c7d57c17ade5024d95b6ca0da249da49b0672a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:26:03 -0600
Subject: virtio: hand virtio ring alignment as argument to vring_new_virtqueue

This allows each virtio user to hand in the alignment appropriate to
their virtio_ring structures.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 drivers/lguest/lguest_device.c | 4 ++--
 drivers/s390/kvm/kvm_virtio.c  | 3 ++-
 drivers/virtio/virtio_pci.c    | 4 ++--
 drivers/virtio/virtio_ring.c   | 3 ++-
 include/linux/virtio_ring.h    | 1 +
 5 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index f062dc55c57..b02f6bcb64c 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -259,8 +259,8 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 
 	/* OK, tell virtio_ring.c to set up a virtqueue now we know its size
 	 * and we've got a pointer to its pages. */
-	vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages,
-				 lg_notify, callback);
+	vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN,
+				 vdev, lvq->pages, lg_notify, callback);
 	if (!vq) {
 		err = -ENOMEM;
 		goto unmap;
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index f5a2dbe7557..4e8354aa857 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -193,7 +193,8 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
 	if (err)
 		goto out;
 
-	vq = vring_new_virtqueue(config->num, vdev, (void *) config->address,
+	vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN,
+				 vdev, (void *) config->address,
 				 kvm_notify, callback);
 	if (!vq) {
 		err = -ENOMEM;
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index f28643f3a4e..7462a51e820 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -249,8 +249,8 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
 		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	/* create the vring */
-	vq = vring_new_virtqueue(info->num, vdev, info->queue,
-				 vp_notify, callback);
+	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN,
+				 vdev, info->queue, vp_notify, callback);
 	if (!vq) {
 		err = -ENOMEM;
 		goto out_activate_queue;
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6eb5303fed1..5777196bf6c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -274,6 +274,7 @@ static struct virtqueue_ops vring_vq_ops = {
 };
 
 struct virtqueue *vring_new_virtqueue(unsigned int num,
+				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      void *pages,
 				      void (*notify)(struct virtqueue *),
@@ -292,7 +293,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	if (!vq)
 		return NULL;
 
-	vring_init(&vq->vring, num, pages, PAGE_SIZE);
+	vring_init(&vq->vring, num, pages, vring_align);
 	vq->vq.callback = callback;
 	vq->vq.vdev = vdev;
 	vq->vq.vq_ops = &vring_vq_ops;
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 01bf3124e31..71e03722fb5 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -115,6 +115,7 @@ struct virtio_device;
 struct virtqueue;
 
 struct virtqueue *vring_new_virtqueue(unsigned int num,
+				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      void *pages,
 				      void (*notify)(struct virtqueue *vq),
-- 
cgit v1.2.3-70-g09d2


From 1b4aa2faeca1b9922033daf2475b6fc13b0ffea6 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Thu, 13 Nov 2008 15:48:33 -0600
Subject: virtio: avoid implicit use of Linux page size in balloon interface

Make the balloon interface always use 4K pages, and convert Linux pfns if
necessary. This patch assumes that Linux's PAGE_SHIFT will never be less than
12.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (modified)
---
 drivers/virtio/virtio_balloon.c | 13 +++++++++++--
 include/linux/virtio_balloon.h  |  3 +++
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 62eab43152d..59268266b79 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -56,6 +56,15 @@ static struct virtio_device_id id_table[] = {
 	{ 0 },
 };
 
+static u32 page_to_balloon_pfn(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+
+	BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT);
+	/* Convert pfn from Linux page size to balloon page size. */
+	return pfn >> (PAGE_SHIFT - VIRTIO_BALLOON_PFN_SHIFT);
+}
+
 static void balloon_ack(struct virtqueue *vq)
 {
 	struct virtio_balloon *vb;
@@ -99,7 +108,7 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
 			msleep(200);
 			break;
 		}
-		vb->pfns[vb->num_pfns] = page_to_pfn(page);
+		vb->pfns[vb->num_pfns] = page_to_balloon_pfn(page);
 		totalram_pages--;
 		vb->num_pages++;
 		list_add(&page->lru, &vb->pages);
@@ -132,7 +141,7 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num)
 	for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns++) {
 		page = list_first_entry(&vb->pages, struct page, lru);
 		list_del(&page->lru);
-		vb->pfns[vb->num_pfns] = page_to_pfn(page);
+		vb->pfns[vb->num_pfns] = page_to_balloon_pfn(page);
 		vb->num_pages--;
 	}
 
diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h
index c30c7bfbf39..8726ff77763 100644
--- a/include/linux/virtio_balloon.h
+++ b/include/linux/virtio_balloon.h
@@ -10,6 +10,9 @@
 /* The feature bitmap for virtio balloon */
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 
+/* Size of a PFN in the balloon interface. */
+#define VIRTIO_BALLOON_PFN_SHIFT 12
+
 struct virtio_balloon_config
 {
 	/* Number of pages host wants Guest to give up. */
-- 
cgit v1.2.3-70-g09d2


From c29834584ea4eafccf2f62a0b8a32e64f792044c Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 25 Nov 2008 13:36:26 +0100
Subject: virtio_console: support console resizing

this patch uses the new hvc callback hvc_resize to set the window size
which allows to change the tty size of hvc_console via a hvc_resize
function.

I have added a new feature bit VIRTIO_CONSOLE_F_SIZE. The driver will
change the window size on tty open and via the config_changed callback
of the transport. Currently lguest and kvm_s390 have not implemented this
callback, but the callback can be implemented at a later point in time.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/char/hvc_console.c     |  1 +
 drivers/char/virtio_console.c  | 30 +++++++++++++++++++++++++++++-
 include/linux/virtio_console.h | 11 +++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index fb57f67bb42..0587b66d6fc 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -695,6 +695,7 @@ void hvc_resize(struct hvc_struct *hp, struct winsize ws)
 	hp->ws = ws;
 	schedule_work(&hp->tty_resize);
 }
+EXPORT_SYMBOL_GPL(hvc_resize);
 
 /*
  * This kthread is either polling or interrupt driven.  This is determined by
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 3fb0d2c88ba..ff6f5a4b58f 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -137,13 +137,34 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
 	return hvc_instantiate(0, 0, &virtio_cons);
 }
 
+/*
+ * virtio console configuration. This supports:
+ * - console resize
+ */
+static void virtcons_apply_config(struct virtio_device *dev)
+{
+	struct winsize ws;
+
+	if (virtio_has_feature(dev, VIRTIO_CONSOLE_F_SIZE)) {
+		dev->config->get(dev,
+				 offsetof(struct virtio_console_config, cols),
+				 &ws.ws_col, sizeof(u16));
+		dev->config->get(dev,
+				 offsetof(struct virtio_console_config, rows),
+				 &ws.ws_row, sizeof(u16));
+		hvc_resize(hvc, ws);
+	}
+}
+
 /*
  * we support only one console, the hvc struct is a global var
- * There is no need to do anything
+ * We set the configuration at this point, since we now have a tty
  */
 static int notifier_add_vio(struct hvc_struct *hp, int data)
 {
 	hp->irq_requested = 1;
+	virtcons_apply_config(vdev);
+
 	return 0;
 }
 
@@ -234,11 +255,18 @@ static struct virtio_device_id id_table[] = {
 	{ 0 },
 };
 
+static unsigned int features[] = {
+	VIRTIO_CONSOLE_F_SIZE,
+};
+
 static struct virtio_driver virtio_console = {
+	.feature_table = features,
+	.feature_table_size = ARRAY_SIZE(features),
 	.driver.name =	KBUILD_MODNAME,
 	.driver.owner =	THIS_MODULE,
 	.id_table =	id_table,
 	.probe =	virtcons_probe,
+	.config_changed = virtcons_apply_config,
 };
 
 static int __init init(void)
diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h
index 19a0da0dba4..7615ffcdd55 100644
--- a/include/linux/virtio_console.h
+++ b/include/linux/virtio_console.h
@@ -7,6 +7,17 @@
 /* The ID for virtio console */
 #define VIRTIO_ID_CONSOLE	3
 
+/* Feature bits */
+#define VIRTIO_CONSOLE_F_SIZE	0	/* Does host provide console size? */
+
+struct virtio_console_config {
+	/* colums of the screens */
+	__u16 cols;
+	/* rows of the screens */
+	__u16 rows;
+} __attribute__((packed));
+
+
 #ifdef __KERNEL__
 int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int));
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3-70-g09d2


From 58a24566449892dda409b9ad92c2e56c76c5670c Mon Sep 17 00:00:00 2001
From: Matias Zabaljauregui <zabaljauregui@gmail.com>
Date: Mon, 29 Sep 2008 01:40:07 -0300
Subject: lguest: move the initial guest page table creation code to the host

This patch moves the initial guest page table creation code to the host,
so the launcher keeps working with PAE enabled configs.

Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/lguest/lguest.c   | 60 ++++------------------------------
 arch/x86/lguest/i386_head.S     | 15 ---------
 drivers/lguest/lg.h             |  2 +-
 drivers/lguest/lguest_user.c    | 13 +++-----
 drivers/lguest/page_tables.c    | 72 +++++++++++++++++++++++++++++++++++++++--
 include/linux/lguest_launcher.h |  2 +-
 6 files changed, 83 insertions(+), 81 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index aa2574ca94c..f2dbbf3bdea 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -481,51 +481,6 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
 	/* We return the initrd size. */
 	return len;
 }
-
-/* Once we know how much memory we have we can construct simple linear page
- * tables which set virtual == physical which will get the Guest far enough
- * into the boot to create its own.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here). */
-static unsigned long setup_pagetables(unsigned long mem,
-				      unsigned long initrd_size)
-{
-	unsigned long *pgdir, *linear;
-	unsigned int mapped_pages, i, linear_pages;
-	unsigned int ptes_per_page = getpagesize()/sizeof(void *);
-
-	mapped_pages = mem/getpagesize();
-
-	/* Each PTE page can map ptes_per_page pages: how many do we need? */
-	linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
-
-	/* We put the toplevel page directory page at the top of memory. */
-	pgdir = from_guest_phys(mem) - initrd_size - getpagesize();
-
-	/* Now we use the next linear_pages pages as pte pages */
-	linear = (void *)pgdir - linear_pages*getpagesize();
-
-	/* Linear mapping is easy: put every page's address into the mapping in
-	 * order.  PAGE_PRESENT contains the flags Present, Writable and
-	 * Executable. */
-	for (i = 0; i < mapped_pages; i++)
-		linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
-
-	/* The top level points to the linear page table pages above. */
-	for (i = 0; i < mapped_pages; i += ptes_per_page) {
-		pgdir[i/ptes_per_page]
-			= ((to_guest_phys(linear) + i*sizeof(void *))
-			   | PAGE_PRESENT);
-	}
-
-	verbose("Linear mapping of %u pages in %u pte pages at %#lx\n",
-		mapped_pages, linear_pages, to_guest_phys(linear));
-
-	/* We return the top level (guest-physical) address: the kernel needs
-	 * to know where it is. */
-	return to_guest_phys(pgdir);
-}
 /*:*/
 
 /* Simple routine to roll all the commandline arguments together with spaces
@@ -548,13 +503,13 @@ static void concat(char *dst, char *args[])
 
 /*L:185 This is where we actually tell the kernel to initialize the Guest.  We
  * saw the arguments it expects when we looked at initialize() in lguest_user.c:
- * the base of Guest "physical" memory, the top physical page to allow, the
- * top level pagetable and the entry point for the Guest. */
-static int tell_kernel(unsigned long pgdir, unsigned long start)
+ * the base of Guest "physical" memory, the top physical page to allow and the
+ * entry point for the Guest. */
+static int tell_kernel(unsigned long start)
 {
 	unsigned long args[] = { LHREQ_INITIALIZE,
 				 (unsigned long)guest_base,
-				 guest_limit / getpagesize(), pgdir, start };
+				 guest_limit / getpagesize(), start };
 	int fd;
 
 	verbose("Guest: %p - %p (%#lx)\n",
@@ -1941,7 +1896,7 @@ int main(int argc, char *argv[])
 {
 	/* Memory, top-level pagetable, code startpoint and size of the
 	 * (optional) initrd. */
-	unsigned long mem = 0, pgdir, start, initrd_size = 0;
+	unsigned long mem = 0, start, initrd_size = 0;
 	/* Two temporaries and the /dev/lguest file descriptor. */
 	int i, c, lguest_fd;
 	/* The boot information for the Guest. */
@@ -2040,9 +1995,6 @@ int main(int argc, char *argv[])
 		boot->hdr.type_of_loader = 0xFF;
 	}
 
-	/* Set up the initial linear pagetables, starting below the initrd. */
-	pgdir = setup_pagetables(mem, initrd_size);
-
 	/* The Linux boot header contains an "E820" memory map: ours is a
 	 * simple, single region. */
 	boot->e820_entries = 1;
@@ -2064,7 +2016,7 @@ int main(int argc, char *argv[])
 
 	/* We tell the kernel to initialize the Guest: this returns the open
 	 * /dev/lguest file descriptor. */
-	lguest_fd = tell_kernel(pgdir, start);
+	lguest_fd = tell_kernel(start);
 
 	/* We clone off a thread, which wakes the Launcher whenever one of the
 	 * input file descriptors needs attention.  We call this the Waker, and
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 5c7cef34c9e..10b9bd35a8f 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -30,21 +30,6 @@ ENTRY(lguest_entry)
 	movl $lguest_data - __PAGE_OFFSET, %edx
 	int $LGUEST_TRAP_ENTRY
 
-	/* The Host put the toplevel pagetable in lguest_data.pgdir.  The movsl
-	 * instruction uses %esi implicitly as the source for the copy we're
-	 * about to do. */
-	movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
-
-	/* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
-	 * This means the first 128M of kernel memory will be mapped at
-	 * PAGE_OFFSET where the kernel expects to run.  This will get it far
-	 * enough through boot to switch to its own pagetables. */
-	movl $32, %ecx
-	movl %esi, %edi
-	addl $((__PAGE_OFFSET >> 22) * 4), %edi
-	rep
-	movsl
-
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
 
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 5faefeaf679..f2c641e0bdd 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -164,7 +164,7 @@ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt);
 void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
 
 /* page_tables.c: */
-int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
+int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
 void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index e73a000473c..34bc017b8b3 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -146,7 +146,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	return 0;
 }
 
-/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
+/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit)
  * values (in addition to the LHREQ_INITIALIZE value).  These are:
  *
  * base: The start of the Guest-physical memory inside the Launcher memory.
@@ -155,9 +155,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
  * allowed to access.  The Guest memory lives inside the Launcher, so it sets
  * this to ensure the Guest can only reach its own memory.
  *
- * pgdir: The (Guest-physical) address of the top of the initial Guest
- * pagetables (which are set up by the Launcher).
- *
  * start: The first instruction to execute ("eip" in x86-speak).
  */
 static int initialize(struct file *file, const unsigned long __user *input)
@@ -166,7 +163,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	 * Guest. */
 	struct lguest *lg;
 	int err;
-	unsigned long args[4];
+	unsigned long args[3];
 
 	/* We grab the Big Lguest lock, which protects against multiple
 	 * simultaneous initializations. */
@@ -192,14 +189,14 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
 
-	/* This is the first cpu (cpu 0) and it will start booting at args[3] */
-	err = lg_cpu_start(&lg->cpus[0], 0, args[3]);
+	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
+	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
 		goto release_guest;
 
 	/* Initialize the Guest's shadow page tables, using the toplevel
 	 * address the Launcher gave us.  This allocates memory, so can fail. */
-	err = init_guest_pagetable(lg, args[2]);
+	err = init_guest_pagetable(lg);
 	if (err)
 		goto free_regs;
 
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 81d0c605344..576a8318221 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -14,6 +14,7 @@
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
+#include <asm/bootparam.h>
 #include "lg.h"
 
 /*M:008 We hold reference to pages, which prevents them from being swapped.
@@ -581,15 +582,82 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
 }
 
+/* Once we know how much memory we have we can construct simple identity
+ * (which set virtual == physical) and linear mappings
+ * which will get the Guest far enough into the boot to create its own.
+ *
+ * We lay them out of the way, just below the initrd (which is why we need to
+ * know its size here). */
+static unsigned long setup_pagetables(struct lguest *lg,
+				      unsigned long mem,
+				      unsigned long initrd_size)
+{
+	pgd_t __user *pgdir;
+	pte_t __user *linear;
+	unsigned int mapped_pages, i, linear_pages, phys_linear;
+	unsigned long mem_base = (unsigned long)lg->mem_base;
+
+	/* We have mapped_pages frames to map, so we need
+	 * linear_pages page tables to map them. */
+	mapped_pages = mem / PAGE_SIZE;
+	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
+
+	/* We put the toplevel page directory page at the top of memory. */
+	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
+
+	/* Now we use the next linear_pages pages as pte pages */
+	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
+
+	/* Linear mapping is easy: put every page's address into the
+	 * mapping in order. */
+	for (i = 0; i < mapped_pages; i++) {
+		pte_t pte;
+		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
+		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
+			return -EFAULT;
+	}
+
+	/* The top level points to the linear page table pages above.
+	 * We setup the identity and linear mappings here. */
+	phys_linear = (unsigned long)linear - mem_base;
+	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
+		pgd_t pgd;
+		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
+			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
+
+		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
+		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
+					   + i / PTRS_PER_PTE],
+				    &pgd, sizeof(pgd)))
+			return -EFAULT;
+	}
+
+	/* We return the top level (guest-physical) address: remember where
+	 * this is. */
+	return (unsigned long)pgdir - mem_base;
+}
+
 /*H:500 (vii) Setting up the page tables initially.
  *
  * When a Guest is first created, the Launcher tells us where the toplevel of
  * its first page table is.  We set some things up here: */
-int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
+int init_guest_pagetable(struct lguest *lg)
 {
+	u64 mem;
+	u32 initrd_size;
+	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
+
+	/* Get the Guest memory size and the ramdisk size from the boot header
+	 * located at lg->mem_base (Guest address 0). */
+	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
+	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
+		return -EFAULT;
+
 	/* We start on the first shadow page table, and give it a blank PGD
 	 * page. */
-	lg->pgdirs[0].gpgdir = pgtable;
+	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
+	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
+		return lg->pgdirs[0].gpgdir;
 	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[0].pgdir)
 		return -ENOMEM;
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index bd0eba76052..a53407a4165 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -54,7 +54,7 @@ struct lguest_vqconfig {
 /* Write command first word is a request. */
 enum lguest_req
 {
-	LHREQ_INITIALIZE, /* + base, pfnlimit, pgdir, start */
+	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
 	LHREQ_GETDMA, /* No longer used */
 	LHREQ_IRQ, /* + irq */
 	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
-- 
cgit v1.2.3-70-g09d2


From 0877258d98154565def498833ccb208234c85084 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Sun, 14 Dec 2008 16:21:16 -0300
Subject: V4L/DVB (9897): v4l2: Add camera zoom controls

The zoom controls move the zoom lens group to a an absolute position, as a
relative displacement or at a given speed until reaching physical device
limits. Positive values move the zoom lens group towards the telephoto
direction, negative values towards the wide-angle direction.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index ec311d4616c..e450a92a622 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1118,6 +1118,10 @@ enum  v4l2_exposure_auto_type {
 #define V4L2_CID_FOCUS_RELATIVE			(V4L2_CID_CAMERA_CLASS_BASE+11)
 #define V4L2_CID_FOCUS_AUTO			(V4L2_CID_CAMERA_CLASS_BASE+12)
 
+#define V4L2_CID_ZOOM_ABSOLUTE			(V4L2_CID_CAMERA_CLASS_BASE+13)
+#define V4L2_CID_ZOOM_RELATIVE			(V4L2_CID_CAMERA_CLASS_BASE+14)
+#define V4L2_CID_ZOOM_CONTINUOUS		(V4L2_CID_CAMERA_CLASS_BASE+15)
+
 /*
  *	T U N I N G
  */
-- 
cgit v1.2.3-70-g09d2


From 046425f8c4ac431db00c09a6d9fba16560b8e5b9 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@skynet.be>
Date: Sun, 14 Dec 2008 16:22:05 -0300
Subject: V4L/DVB (9898): v4l2: Add privacy control

The privacy control prevents video from being acquired by the camera. A true
value indicates that no image can be captured. Devices that implement the
privacy control must support read access and may support write access.

Signed-off-by: Laurent Pinchart <laurent.pinchart@skynet.be>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index e450a92a622..6e6743bd0f9 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1122,6 +1122,8 @@ enum  v4l2_exposure_auto_type {
 #define V4L2_CID_ZOOM_RELATIVE			(V4L2_CID_CAMERA_CLASS_BASE+14)
 #define V4L2_CID_ZOOM_CONTINUOUS		(V4L2_CID_CAMERA_CLASS_BASE+15)
 
+#define V4L2_CID_PRIVACY			(V4L2_CID_CAMERA_CLASS_BASE+16)
+
 /*
  *	T U N I N G
  */
-- 
cgit v1.2.3-70-g09d2


From 92f45badbbaccdbc1be25085292a1e258948e221 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Sun, 21 Dec 2008 10:35:25 -0300
Subject: V4L/DVB (9932): v4l2-compat32: fix 32-64 compatibility module

Added all missing v4l1/2 ioctls and fix several broken conversions.
Partially based on work done by Cody Pisto <cpisto@gmail.com>.

Tested-by: Brandon Jenkins <bcjenkins@tvwhere.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/v4l2-compat-ioctl32.c | 781 ++++++++++++++++++------------
 include/linux/videodev2.h                 |   2 +
 2 files changed, 467 insertions(+), 316 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c
index 9b50b43b76a..5084773d4f1 100644
--- a/drivers/media/video/v4l2-compat-ioctl32.c
+++ b/drivers/media/video/v4l2-compat-ioctl32.c
@@ -7,12 +7,14 @@
  * Copyright (C) 2001,2002  Andi Kleen, SuSE Labs
  * Copyright (C) 2003       Pavel Machek (pavel@suse.cz)
  * Copyright (C) 2005       Philippe De Muyter (phdm@macqel.be)
+ * Copyright (C) 2008       Hans Verkuil <hverkuil@xs4all.nl>
  *
  * These routines maintain argument size conversion between 32bit and 64bit
  * ioctls.
  */
 
 #include <linux/compat.h>
+#define __OLD_VIDIOC_ /* To allow fixing old calls*/
 #include <linux/videodev.h>
 #include <linux/videodev2.h>
 #include <linux/module.h>
@@ -58,7 +60,6 @@ static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user
 	return 0;
 }
 
-
 struct video_buffer32 {
 	compat_caddr_t base;
 	compat_int_t height, width, depth, bytesperline;
@@ -99,7 +100,7 @@ static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __u
 }
 
 struct video_clip32 {
-	s32 x, y, width, height;	/* Its really s32 in videodev.h */
+	s32 x, y, width, height;	/* It's really s32 in videodev.h */
 	compat_caddr_t next;
 };
 
@@ -108,25 +109,72 @@ struct video_window32 {
 	compat_caddr_t clips;
 	compat_int_t clipcount;
 };
-#endif
 
-static int native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+static int get_video_window32(struct video_window *kp, struct video_window32 __user *up)
 {
-	int ret = -ENOIOCTLCMD;
+	struct video_clip __user *uclips;
+	struct video_clip __user *kclips;
+	compat_caddr_t p;
+	int nclips;
 
-	if (file->f_op->unlocked_ioctl)
-		ret = file->f_op->unlocked_ioctl(file, cmd, arg);
-	else if (file->f_op->ioctl) {
-		lock_kernel();
-		ret = file->f_op->ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
-		unlock_kernel();
+	if (!access_ok(VERIFY_READ, up, sizeof(struct video_window32)))
+		return -EFAULT;
+
+	if (get_user(nclips, &up->clipcount))
+		return -EFAULT;
+
+	if (!access_ok(VERIFY_READ, up, sizeof(struct video_window32)) ||
+	    get_user(kp->x, &up->x) ||
+	    get_user(kp->y, &up->y) ||
+	    get_user(kp->width, &up->width) ||
+	    get_user(kp->height, &up->height) ||
+	    get_user(kp->chromakey, &up->chromakey) ||
+	    get_user(kp->flags, &up->flags) ||
+	    get_user(kp->clipcount, &up->clipcount))
+		return -EFAULT;
+
+	nclips = kp->clipcount;
+	kp->clips = NULL;
+
+	if (nclips == 0)
+		return 0;
+	if (get_user(p, &up->clips))
+		return -EFAULT;
+	uclips = compat_ptr(p);
+
+	/* If nclips < 0, then it is a clipping bitmap of size
+	   VIDEO_CLIPMAP_SIZE */
+	if (nclips < 0) {
+		if (!access_ok(VERIFY_READ, uclips, VIDEO_CLIPMAP_SIZE))
+			return -EFAULT;
+		kp->clips = compat_alloc_user_space(VIDEO_CLIPMAP_SIZE);
+		if (copy_in_user(kp->clips, uclips, VIDEO_CLIPMAP_SIZE))
+			return -EFAULT;
+		return 0;
 	}
 
-	return ret;
-}
+	/* Otherwise it is an array of video_clip structs. */
+	if (!access_ok(VERIFY_READ, uclips, nclips * sizeof(struct video_clip)))
+		return -EFAULT;
 
+	kp->clips = compat_alloc_user_space(nclips * sizeof(struct video_clip));
+	kclips = kp->clips;
+	while (nclips--) {
+		int err;
+
+		err = copy_in_user(&kclips->x, &uclips->x, sizeof(kclips->x));
+		err |= copy_in_user(&kclips->y, &uclips->y, sizeof(kclips->y));
+		err |= copy_in_user(&kclips->width, &uclips->width, sizeof(kclips->width));
+		err |= copy_in_user(&kclips->height, &uclips->height, sizeof(kclips->height));
+		kclips->next = NULL;
+		if (err)
+			return -EFAULT;
+		kclips++;
+		uclips++;
+	}
+	return 0;
+}
 
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
 /* You get back everything except the clips... */
 static int put_video_window32(struct video_window *kp, struct video_window32 __user *up)
 {
@@ -141,8 +189,55 @@ static int put_video_window32(struct video_window *kp, struct video_window32 __u
 			return -EFAULT;
 	return 0;
 }
+
+struct video_code32 {
+	char		loadwhat[16];	/* name or tag of file being passed */
+	compat_int_t	datasize;
+	unsigned char	*data;
+};
+
+static int get_microcode32(struct video_code *kp, struct video_code32 __user *up)
+{
+	if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) ||
+		copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) ||
+		get_user(kp->datasize, &up->datasize) ||
+		copy_from_user(kp->data, up->data, up->datasize))
+			return -EFAULT;
+	return 0;
+}
+
+#define VIDIOCGTUNER32		_IOWR('v', 4, struct video_tuner32)
+#define VIDIOCSTUNER32		_IOW('v', 5, struct video_tuner32)
+#define VIDIOCGWIN32		_IOR('v', 9, struct video_window32)
+#define VIDIOCSWIN32		_IOW('v', 10, struct video_window32)
+#define VIDIOCGFBUF32		_IOR('v', 11, struct video_buffer32)
+#define VIDIOCSFBUF32		_IOW('v', 12, struct video_buffer32)
+#define VIDIOCGFREQ32		_IOR('v', 14, u32)
+#define VIDIOCSFREQ32		_IOW('v', 15, u32)
+#define VIDIOCSMICROCODE32	_IOW('v', 27, struct video_code32)
+
+#define VIDIOCCAPTURE32		_IOW('v', 8, s32)
+#define VIDIOCSYNC32		_IOW('v', 18, s32)
+#define VIDIOCSWRITEMODE32	_IOW('v', 25, s32)
+
 #endif
 
+static int native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret = -ENOIOCTLCMD;
+
+	if (file->f_op->unlocked_ioctl)
+		ret = file->f_op->unlocked_ioctl(file, cmd, arg);
+	else if (file->f_op->ioctl) {
+		lock_kernel();
+		ret = file->f_op->ioctl(file->f_path.dentry->d_inode, file, cmd, arg);
+		unlock_kernel();
+	}
+
+	return ret;
+}
+
+
 struct v4l2_clip32 {
 	struct v4l2_rect        c;
 	compat_caddr_t 		next;
@@ -229,12 +324,27 @@ static inline int put_v4l2_vbi_format(struct v4l2_vbi_format *kp, struct v4l2_vb
 	return 0;
 }
 
+static inline int get_v4l2_sliced_vbi_format(struct v4l2_sliced_vbi_format *kp, struct v4l2_sliced_vbi_format __user *up)
+{
+	if (copy_from_user(kp, up, sizeof(struct v4l2_sliced_vbi_format)))
+		return -EFAULT;
+	return 0;
+}
+
+static inline int put_v4l2_sliced_vbi_format(struct v4l2_sliced_vbi_format *kp, struct v4l2_sliced_vbi_format __user *up)
+{
+	if (copy_to_user(up, kp, sizeof(struct v4l2_sliced_vbi_format)))
+		return -EFAULT;
+	return 0;
+}
+
 struct v4l2_format32 {
 	enum v4l2_buf_type type;
 	union {
-		struct v4l2_pix_format	pix;  /* V4L2_BUF_TYPE_VIDEO_CAPTURE */
-		struct v4l2_window32	win;  /* V4L2_BUF_TYPE_VIDEO_OVERLAY */
-		struct v4l2_vbi_format	vbi;  /* V4L2_BUF_TYPE_VBI_CAPTURE */
+		struct v4l2_pix_format	pix;
+		struct v4l2_window32	win;
+		struct v4l2_vbi_format	vbi;
+		struct v4l2_sliced_vbi_format	sliced;
 		__u8	raw_data[200];        /* user-defined */
 	} fmt;
 };
@@ -246,15 +356,27 @@ static int get_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __user
 			return -EFAULT;
 	switch (kp->type) {
 	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
 		return get_v4l2_pix_format(&kp->fmt.pix, &up->fmt.pix);
 	case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY:
 		return get_v4l2_window32(&kp->fmt.win, &up->fmt.win);
 	case V4L2_BUF_TYPE_VBI_CAPTURE:
+	case V4L2_BUF_TYPE_VBI_OUTPUT:
 		return get_v4l2_vbi_format(&kp->fmt.vbi, &up->fmt.vbi);
+	case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+	case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+		return get_v4l2_sliced_vbi_format(&kp->fmt.sliced, &up->fmt.sliced);
+	case V4L2_BUF_TYPE_PRIVATE:
+		if (copy_from_user(kp, up, sizeof(kp->fmt.raw_data)))
+			return -EFAULT;
+		return 0;
+	case 0:
+		return -EINVAL;
 	default:
-		printk(KERN_INFO "compat_ioctl: unexpected VIDIOC_FMT type %d\n",
+		printk(KERN_INFO "compat_ioctl32: unexpected VIDIOC_FMT type %d\n",
 								kp->type);
-		return -ENXIO;
+		return -EINVAL;
 	}
 }
 
@@ -265,31 +387,30 @@ static int put_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __user
 		return -EFAULT;
 	switch (kp->type) {
 	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
 		return put_v4l2_pix_format(&kp->fmt.pix, &up->fmt.pix);
 	case V4L2_BUF_TYPE_VIDEO_OVERLAY:
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY:
 		return put_v4l2_window32(&kp->fmt.win, &up->fmt.win);
 	case V4L2_BUF_TYPE_VBI_CAPTURE:
+	case V4L2_BUF_TYPE_VBI_OUTPUT:
 		return put_v4l2_vbi_format(&kp->fmt.vbi, &up->fmt.vbi);
+	case V4L2_BUF_TYPE_SLICED_VBI_CAPTURE:
+	case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
+		return put_v4l2_sliced_vbi_format(&kp->fmt.sliced, &up->fmt.sliced);
+	case V4L2_BUF_TYPE_PRIVATE:
+		if (copy_to_user(up, kp, sizeof(up->fmt.raw_data)))
+			return -EFAULT;
+		return 0;
+	case 0:
+		return -EINVAL;
 	default:
-		return -ENXIO;
+		printk(KERN_INFO "compat_ioctl32: unexpected VIDIOC_FMT type %d\n",
+								kp->type);
+		return -EINVAL;
 	}
 }
 
-static inline int get_v4l2_standard(struct v4l2_standard *kp, struct v4l2_standard __user *up)
-{
-	if (copy_from_user(kp, up, sizeof(struct v4l2_standard)))
-		return -EFAULT;
-	return 0;
-
-}
-
-static inline int put_v4l2_standard(struct v4l2_standard *kp, struct v4l2_standard __user *up)
-{
-	if (copy_to_user(up, kp, sizeof(struct v4l2_standard)))
-		return -EFAULT;
-	return 0;
-}
-
 struct v4l2_standard32 {
 	__u32		     index;
 	__u32		     id[2]; /* __u64 would get the alignment wrong */
@@ -321,21 +442,6 @@ static int put_v4l2_standard32(struct v4l2_standard *kp, struct v4l2_standard32
 	return 0;
 }
 
-static inline int get_v4l2_tuner(struct v4l2_tuner *kp, struct v4l2_tuner __user *up)
-{
-	if (copy_from_user(kp, up, sizeof(struct v4l2_tuner)))
-		return -EFAULT;
-	return 0;
-
-}
-
-static inline int put_v4l2_tuner(struct v4l2_tuner *kp, struct v4l2_tuner __user *up)
-{
-	if (copy_to_user(up, kp, sizeof(struct v4l2_tuner)))
-		return -EFAULT;
-	return 0;
-}
-
 struct v4l2_buffer32 {
 	__u32			index;
 	enum v4l2_buf_type      type;
@@ -459,149 +565,143 @@ static int put_v4l2_framebuffer32(struct v4l2_framebuffer *kp, struct v4l2_frame
 	return 0;
 }
 
-static inline int get_v4l2_input32(struct v4l2_input *kp, struct v4l2_input __user *up)
-{
-	if (copy_from_user(kp, up, sizeof(struct v4l2_input) - 4))
-		return -EFAULT;
-	return 0;
-}
-
-static inline int put_v4l2_input32(struct v4l2_input *kp, struct v4l2_input __user *up)
+struct v4l2_input32 {
+	__u32	     index;		/*  Which input */
+	__u8	     name[32];		/*  Label */
+	__u32	     type;		/*  Type of input */
+	__u32	     audioset;		/*  Associated audios (bitfield) */
+	__u32        tuner;             /*  Associated tuner */
+	v4l2_std_id  std;
+	__u32	     status;
+	__u32	     reserved[4];
+} __attribute__ ((packed));
+
+/* The 64-bit v4l2_input struct has extra padding at the end of the struct.
+   Otherwise it is identical to the 32-bit version. */
+static inline int get_v4l2_input32(struct v4l2_input *kp, struct v4l2_input32 __user *up)
 {
-	if (copy_to_user(up, kp, sizeof(struct v4l2_input) - 4))
+	if (copy_from_user(kp, up, sizeof(struct v4l2_input32)))
 		return -EFAULT;
 	return 0;
 }
 
-static inline int get_v4l2_input(struct v4l2_input *kp, struct v4l2_input __user *up)
+static inline int put_v4l2_input32(struct v4l2_input *kp, struct v4l2_input32 __user *up)
 {
-	if (copy_from_user(kp, up, sizeof(struct v4l2_input)))
+	if (copy_to_user(up, kp, sizeof(struct v4l2_input32)))
 		return -EFAULT;
 	return 0;
 }
 
-static inline int put_v4l2_input(struct v4l2_input *kp, struct v4l2_input __user *up)
-{
-	if (copy_to_user(up, kp, sizeof(struct v4l2_input)))
-		return -EFAULT;
-	return 0;
-}
-
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
-struct video_code32 {
-	char		loadwhat[16];	/* name or tag of file being passed */
-	compat_int_t	datasize;
-	unsigned char	*data;
+struct v4l2_ext_controls32 {
+       __u32 ctrl_class;
+       __u32 count;
+       __u32 error_idx;
+       __u32 reserved[2];
+       compat_caddr_t controls; /* actually struct v4l2_ext_control32 * */
 };
 
-static inline int microcode32(struct video_code *kp, struct video_code32 __user *up)
+static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
 {
-	if (!access_ok(VERIFY_READ, up, sizeof(struct video_code32)) ||
-		copy_from_user(kp->loadwhat, up->loadwhat, sizeof(up->loadwhat)) ||
-		get_user(kp->datasize, &up->datasize) ||
-		copy_from_user(kp->data, up->data, up->datasize))
+	struct v4l2_ext_control __user *ucontrols;
+	struct v4l2_ext_control __user *kcontrols;
+	int n;
+	compat_caddr_t p;
+
+	if (!access_ok(VERIFY_READ, up, sizeof(struct v4l2_ext_controls32)) ||
+		get_user(kp->ctrl_class, &up->ctrl_class) ||
+		get_user(kp->count, &up->count) ||
+		get_user(kp->error_idx, &up->error_idx) ||
+		copy_from_user(kp->reserved, up->reserved, sizeof(kp->reserved)))
+			return -EFAULT;
+	n = kp->count;
+	if (n == 0) {
+		kp->controls = NULL;
+		return 0;
+	}
+	if (get_user(p, &up->controls))
+		return -EFAULT;
+	ucontrols = compat_ptr(p);
+	if (!access_ok(VERIFY_READ, ucontrols, n * sizeof(struct v4l2_ext_control)))
+		return -EFAULT;
+	kcontrols = compat_alloc_user_space(n * sizeof(struct v4l2_ext_control));
+	kp->controls = kcontrols;
+	while (--n >= 0) {
+		if (copy_in_user(&kcontrols->id, &ucontrols->id, sizeof(__u32)))
+			return -EFAULT;
+		if (copy_in_user(&kcontrols->reserved2, &ucontrols->reserved2, sizeof(ucontrols->reserved2)))
 			return -EFAULT;
+		/* Note: if the void * part of the union ever becomes relevant
+		   then we need to know the type of the control in order to do
+		   the right thing here. Luckily, that is not yet an issue. */
+		if (copy_in_user(&kcontrols->value, &ucontrols->value, sizeof(ucontrols->value)))
+			return -EFAULT;
+		ucontrols++;
+		kcontrols++;
+	}
 	return 0;
 }
 
-#define VIDIOCGTUNER32		_IOWR('v', 4, struct video_tuner32)
-#define VIDIOCSTUNER32		_IOW('v', 5, struct video_tuner32)
-#define VIDIOCGWIN32		_IOR('v', 9, struct video_window32)
-#define VIDIOCSWIN32		_IOW('v', 10, struct video_window32)
-#define VIDIOCGFBUF32		_IOR('v', 11, struct video_buffer32)
-#define VIDIOCSFBUF32		_IOW('v', 12, struct video_buffer32)
-#define VIDIOCGFREQ32		_IOR('v', 14, u32)
-#define VIDIOCSFREQ32		_IOW('v', 15, u32)
-#define VIDIOCSMICROCODE32	_IOW('v', 27, struct video_code32)
-
-#endif
-
-/* VIDIOC_ENUMINPUT32 is VIDIOC_ENUMINPUT minus 4 bytes of padding alignement */
-#define VIDIOC_ENUMINPUT32	(VIDIOC_ENUMINPUT - _IOC(0, 0, 0, 4))
-#define VIDIOC_G_FMT32		_IOWR ('V',  4, struct v4l2_format32)
-#define VIDIOC_S_FMT32		_IOWR ('V',  5, struct v4l2_format32)
-#define VIDIOC_QUERYBUF32	_IOWR ('V',  9, struct v4l2_buffer32)
-#define VIDIOC_G_FBUF32		_IOR  ('V', 10, struct v4l2_framebuffer32)
-#define VIDIOC_S_FBUF32		_IOW  ('V', 11, struct v4l2_framebuffer32)
-/* VIDIOC_OVERLAY is now _IOW, but was _IOWR */
-#define VIDIOC_OVERLAY32	_IOWR ('V', 14, compat_int_t)
-#define VIDIOC_QBUF32		_IOWR ('V', 15, struct v4l2_buffer32)
-#define VIDIOC_DQBUF32		_IOWR ('V', 17, struct v4l2_buffer32)
-#define VIDIOC_STREAMON32	_IOW  ('V', 18, compat_int_t)
-#define VIDIOC_STREAMOFF32	_IOW  ('V', 19, compat_int_t)
-#define VIDIOC_ENUMSTD32	_IOWR ('V', 25, struct v4l2_standard32)
-/* VIDIOC_S_CTRL is now _IOWR, but was _IOW */
-#define VIDIOC_S_CTRL32		_IOW  ('V', 28, struct v4l2_control)
-#define VIDIOC_G_INPUT32	_IOR  ('V', 38, compat_int_t)
-#define VIDIOC_S_INPUT32	_IOWR ('V', 39, compat_int_t)
-#define VIDIOC_TRY_FMT32      	_IOWR ('V', 64, struct v4l2_format32)
-
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
-enum {
-	MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip)
-};
-
-static int do_set_window(struct file *file, unsigned int cmd, unsigned long arg)
+static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
 {
-	struct video_window32 __user *up = compat_ptr(arg);
-	struct video_window __user *vw;
-	struct video_clip __user *p;
-	int nclips;
-	u32 n;
-
-	if (!access_ok(VERIFY_READ, up, sizeof(struct video_window32)))
-		return -EFAULT;
+	struct v4l2_ext_control __user *ucontrols;
+	struct v4l2_ext_control __user *kcontrols = kp->controls;
+	int n = kp->count;
+	compat_caddr_t p;
+
+	if (!access_ok(VERIFY_WRITE, up, sizeof(struct v4l2_ext_controls32)) ||
+		put_user(kp->ctrl_class, &up->ctrl_class) ||
+		put_user(kp->count, &up->count) ||
+		put_user(kp->error_idx, &up->error_idx) ||
+		copy_to_user(up->reserved, kp->reserved, sizeof(up->reserved)))
+			return -EFAULT;
+	if (!kp->count)
+		return 0;
 
-	if (get_user(nclips, &up->clipcount))
+	if (get_user(p, &up->controls))
 		return -EFAULT;
-
-	/* Peculiar interface... */
-	if (nclips < 0)
-		nclips = VIDEO_CLIPMAP_SIZE;
-
-	if (nclips > MaxClips)
-		return -ENOMEM;
-
-	vw = compat_alloc_user_space(sizeof(struct video_window) +
-				    nclips * sizeof(struct video_clip));
-
-	p = nclips ? (struct video_clip __user *)(vw + 1) : NULL;
-
-	if (get_user(n, &up->x) || put_user(n, &vw->x) ||
-	    get_user(n, &up->y) || put_user(n, &vw->y) ||
-	    get_user(n, &up->width) || put_user(n, &vw->width) ||
-	    get_user(n, &up->height) || put_user(n, &vw->height) ||
-	    get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) ||
-	    get_user(n, &up->flags) || put_user(n, &vw->flags) ||
-	    get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) ||
-	    get_user(n, &up->clips) || put_user(p, &vw->clips))
+	ucontrols = compat_ptr(p);
+	if (!access_ok(VERIFY_WRITE, ucontrols, n * sizeof(struct v4l2_ext_control)))
 		return -EFAULT;
 
-	if (nclips) {
-		struct video_clip32 __user *u = compat_ptr(n);
-		int i;
-		if (!u)
-			return -EINVAL;
-		for (i = 0; i < nclips; i++, u++, p++) {
-			s32 v;
-			if (!access_ok(VERIFY_READ, u, sizeof(struct video_clip32)) ||
-			    !access_ok(VERIFY_WRITE, p, sizeof(struct video_clip32)) ||
-			    get_user(v, &u->x) ||
-			    put_user(v, &p->x) ||
-			    get_user(v, &u->y) ||
-			    put_user(v, &p->y) ||
-			    get_user(v, &u->width) ||
-			    put_user(v, &p->width) ||
-			    get_user(v, &u->height) ||
-			    put_user(v, &p->height) ||
-			    put_user(NULL, &p->next))
-				return -EFAULT;
-		}
+	while (--n >= 0) {
+		if (copy_in_user(&ucontrols->id, &kcontrols->id, sizeof(__u32)))
+			return -EFAULT;
+		if (copy_in_user(&ucontrols->reserved2, &kcontrols->reserved2,
+					sizeof(ucontrols->reserved2)))
+			return -EFAULT;
+		/* Note: if the void * part of the union ever becomes relevant
+		   then we need to know the type of the control in order to do
+		   the right thing here. Luckily, that is not yet an issue. */
+		if (copy_in_user(&ucontrols->value, &kcontrols->value, sizeof(ucontrols->value)))
+			return -EFAULT;
+		ucontrols++;
+		kcontrols++;
 	}
-
-	return native_ioctl(file, VIDIOCSWIN, (unsigned long)vw);
+	return 0;
 }
-#endif
+
+#define VIDIOC_G_FMT32		_IOWR('V',  4, struct v4l2_format32)
+#define VIDIOC_S_FMT32		_IOWR('V',  5, struct v4l2_format32)
+#define VIDIOC_QUERYBUF32	_IOWR('V',  9, struct v4l2_buffer32)
+#define VIDIOC_G_FBUF32		_IOR ('V', 10, struct v4l2_framebuffer32)
+#define VIDIOC_S_FBUF32		_IOW ('V', 11, struct v4l2_framebuffer32)
+#define VIDIOC_QBUF32		_IOWR('V', 15, struct v4l2_buffer32)
+#define VIDIOC_DQBUF32		_IOWR('V', 17, struct v4l2_buffer32)
+#define VIDIOC_ENUMSTD32	_IOWR('V', 25, struct v4l2_standard32)
+#define VIDIOC_ENUMINPUT32	_IOWR('V', 26, struct v4l2_input32)
+#define VIDIOC_TRY_FMT32      	_IOWR('V', 64, struct v4l2_format32)
+#define VIDIOC_G_EXT_CTRLS32    _IOWR('V', 71, struct v4l2_ext_controls32)
+#define VIDIOC_S_EXT_CTRLS32    _IOWR('V', 72, struct v4l2_ext_controls32)
+#define VIDIOC_TRY_EXT_CTRLS32  _IOWR('V', 73, struct v4l2_ext_controls32)
+
+#define VIDIOC_OVERLAY32	_IOW ('V', 14, s32)
+#define VIDIOC_OVERLAY32_OLD	_IOWR('V', 14, s32)
+#define VIDIOC_STREAMON32	_IOW ('V', 18, s32)
+#define VIDIOC_STREAMOFF32	_IOW ('V', 19, s32)
+#define VIDIOC_G_INPUT32	_IOR ('V', 38, s32)
+#define VIDIOC_S_INPUT32	_IOWR('V', 39, s32)
+#define VIDIOC_G_OUTPUT32	_IOR ('V', 46, s32)
+#define VIDIOC_S_OUTPUT32	_IOWR('V', 47, s32)
 
 static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -616,45 +716,51 @@ static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		struct v4l2_format v2f;
 		struct v4l2_buffer v2b;
 		struct v4l2_framebuffer v2fb;
-		struct v4l2_standard v2s;
 		struct v4l2_input v2i;
-		struct v4l2_tuner v2t;
+		struct v4l2_standard v2s;
+		struct v4l2_ext_controls v2ecs;
 		unsigned long vx;
+		int vi;
 	} karg;
 	void __user *up = compat_ptr(arg);
 	int compatible_arg = 1;
 	int err = 0;
-	int realcmd = cmd;
 
 	/* First, convert the command. */
 	switch (cmd) {
 #ifdef CONFIG_VIDEO_V4L1_COMPAT
-	case VIDIOCGTUNER32: realcmd = cmd = VIDIOCGTUNER; break;
-	case VIDIOCSTUNER32: realcmd = cmd = VIDIOCSTUNER; break;
-	case VIDIOCGWIN32: realcmd = cmd = VIDIOCGWIN; break;
-	case VIDIOCGFBUF32: realcmd = cmd = VIDIOCGFBUF; break;
-	case VIDIOCSFBUF32: realcmd = cmd = VIDIOCSFBUF; break;
-	case VIDIOCGFREQ32: realcmd = cmd = VIDIOCGFREQ; break;
-	case VIDIOCSFREQ32: realcmd = cmd = VIDIOCSFREQ; break;
-	case VIDIOCSMICROCODE32: realcmd = cmd = VIDIOCSMICROCODE; break;
+	case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break;
+	case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break;
+	case VIDIOCGWIN32: cmd = VIDIOCGWIN; break;
+	case VIDIOCSWIN32: cmd = VIDIOCSWIN; break;
+	case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break;
+	case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break;
+	case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break;
+	case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break;
+	case VIDIOCSMICROCODE32: cmd = VIDIOCSMICROCODE; break;
 #endif
-	case VIDIOC_G_FMT32: realcmd = cmd = VIDIOC_G_FMT; break;
-	case VIDIOC_S_FMT32: realcmd = cmd = VIDIOC_S_FMT; break;
-	case VIDIOC_QUERYBUF32: realcmd = cmd = VIDIOC_QUERYBUF; break;
-	case VIDIOC_QBUF32: realcmd = cmd = VIDIOC_QBUF; break;
-	case VIDIOC_DQBUF32: realcmd = cmd = VIDIOC_DQBUF; break;
-	case VIDIOC_STREAMON32: realcmd = cmd = VIDIOC_STREAMON; break;
-	case VIDIOC_STREAMOFF32: realcmd = cmd = VIDIOC_STREAMOFF; break;
-	case VIDIOC_G_FBUF32: realcmd = cmd = VIDIOC_G_FBUF; break;
-	case VIDIOC_S_FBUF32: realcmd = cmd = VIDIOC_S_FBUF; break;
-	case VIDIOC_OVERLAY32: realcmd = cmd = VIDIOC_OVERLAY; break;
-	case VIDIOC_ENUMSTD32: realcmd = VIDIOC_ENUMSTD; break;
-	case VIDIOC_ENUMINPUT32: realcmd = VIDIOC_ENUMINPUT; break;
-	case VIDIOC_S_CTRL32: realcmd = cmd = VIDIOC_S_CTRL; break;
-	case VIDIOC_G_INPUT32: realcmd = cmd = VIDIOC_G_INPUT; break;
-	case VIDIOC_S_INPUT32: realcmd = cmd = VIDIOC_S_INPUT; break;
-	case VIDIOC_TRY_FMT32: realcmd = cmd = VIDIOC_TRY_FMT; break;
-	};
+	case VIDIOC_G_FMT32: cmd = VIDIOC_G_FMT; break;
+	case VIDIOC_S_FMT32: cmd = VIDIOC_S_FMT; break;
+	case VIDIOC_QUERYBUF32: cmd = VIDIOC_QUERYBUF; break;
+	case VIDIOC_G_FBUF32: cmd = VIDIOC_G_FBUF; break;
+	case VIDIOC_S_FBUF32: cmd = VIDIOC_S_FBUF; break;
+	case VIDIOC_QBUF32: cmd = VIDIOC_QBUF; break;
+	case VIDIOC_DQBUF32: cmd = VIDIOC_DQBUF; break;
+	case VIDIOC_ENUMSTD32: cmd = VIDIOC_ENUMSTD; break;
+	case VIDIOC_ENUMINPUT32: cmd = VIDIOC_ENUMINPUT; break;
+	case VIDIOC_TRY_FMT32: cmd = VIDIOC_TRY_FMT; break;
+	case VIDIOC_G_EXT_CTRLS32: cmd = VIDIOC_G_EXT_CTRLS; break;
+	case VIDIOC_S_EXT_CTRLS32: cmd = VIDIOC_S_EXT_CTRLS; break;
+	case VIDIOC_TRY_EXT_CTRLS32: cmd = VIDIOC_TRY_EXT_CTRLS; break;
+	case VIDIOC_OVERLAY32: cmd = VIDIOC_OVERLAY; break;
+	case VIDIOC_OVERLAY32_OLD: cmd = VIDIOC_OVERLAY; break;
+	case VIDIOC_STREAMON32: cmd = VIDIOC_STREAMON; break;
+	case VIDIOC_STREAMOFF32: cmd = VIDIOC_STREAMOFF; break;
+	case VIDIOC_G_INPUT32: cmd = VIDIOC_G_INPUT; break;
+	case VIDIOC_S_INPUT32: cmd = VIDIOC_S_INPUT; break;
+	case VIDIOC_G_OUTPUT32: cmd = VIDIOC_G_OUTPUT; break;
+	case VIDIOC_S_OUTPUT32: cmd = VIDIOC_S_OUTPUT; break;
+	}
 
 	switch (cmd) {
 #ifdef CONFIG_VIDEO_V4L1_COMPAT
@@ -662,7 +768,6 @@ static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 	case VIDIOCGTUNER:
 		err = get_video_tuner32(&karg.vt, up);
 		compatible_arg = 0;
-
 		break;
 
 	case VIDIOCSFBUF:
@@ -670,19 +775,42 @@ static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		compatible_arg = 0;
 		break;
 
+	case VIDIOCSWIN:
+		err = get_video_window32(&karg.vw, up);
+		compatible_arg = 0;
+		break;
+
+	case VIDIOCGWIN:
+	case VIDIOCGFBUF:
+	case VIDIOCGFREQ:
+		compatible_arg = 0;
+		break;
+
+	case VIDIOCSMICROCODE:
+		err = get_microcode32(&karg.vc, up);
+		compatible_arg = 0;
+		break;
 
 	case VIDIOCSFREQ:
+		err = get_user(karg.vx, (u32 __user *)up);
+		compatible_arg = 0;
+		break;
+
+	case VIDIOCCAPTURE:
+	case VIDIOCSYNC:
+	case VIDIOCSWRITEMODE:
 #endif
-	case VIDIOC_S_INPUT:
 	case VIDIOC_OVERLAY:
 	case VIDIOC_STREAMON:
 	case VIDIOC_STREAMOFF:
-		err = get_user(karg.vx, (u32 __user *)up);
-		compatible_arg = 1;
+	case VIDIOC_S_INPUT:
+	case VIDIOC_S_OUTPUT:
+		err = get_user(karg.vi, (s32 __user *)up);
+		compatible_arg = 0;
 		break;
 
-	case VIDIOC_S_FBUF:
-		err = get_v4l2_framebuffer32(&karg.v2fb, up);
+	case VIDIOC_G_INPUT:
+	case VIDIOC_G_OUTPUT:
 		compatible_arg = 0;
 		break;
 
@@ -700,122 +828,108 @@ static int do_video_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		compatible_arg = 0;
 		break;
 
-	case VIDIOC_ENUMSTD:
-		err = get_v4l2_standard(&karg.v2s, up);
+	case VIDIOC_S_FBUF:
+		err = get_v4l2_framebuffer32(&karg.v2fb, up);
 		compatible_arg = 0;
 		break;
 
-	case VIDIOC_ENUMSTD32:
-		err = get_v4l2_standard32(&karg.v2s, up);
+	case VIDIOC_G_FBUF:
 		compatible_arg = 0;
 		break;
 
-	case VIDIOC_ENUMINPUT:
-		err = get_v4l2_input(&karg.v2i, up);
+	case VIDIOC_ENUMSTD:
+		err = get_v4l2_standard32(&karg.v2s, up);
 		compatible_arg = 0;
 		break;
 
-	case VIDIOC_ENUMINPUT32:
+	case VIDIOC_ENUMINPUT:
 		err = get_v4l2_input32(&karg.v2i, up);
 		compatible_arg = 0;
 		break;
 
-	case VIDIOC_G_TUNER:
-	case VIDIOC_S_TUNER:
-		err = get_v4l2_tuner(&karg.v2t, up);
+	case VIDIOC_G_EXT_CTRLS:
+	case VIDIOC_S_EXT_CTRLS:
+	case VIDIOC_TRY_EXT_CTRLS:
+		err = get_v4l2_ext_controls32(&karg.v2ecs, up);
 		compatible_arg = 0;
 		break;
-
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
-	case VIDIOCGWIN:
-	case VIDIOCGFBUF:
-	case VIDIOCGFREQ:
-#endif
-	case VIDIOC_G_FBUF:
-	case VIDIOC_G_INPUT:
-		compatible_arg = 0;
-		break;
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
-	case VIDIOCSMICROCODE:
-		err = microcode32(&karg.vc, up);
-		compatible_arg = 0;
-		break;
-#endif
-	};
+	}
 	if (err)
-		goto out;
+		return err;
 
 	if (compatible_arg)
-		err = native_ioctl(file, realcmd, (unsigned long)up);
+		err = native_ioctl(file, cmd, (unsigned long)up);
 	else {
 		mm_segment_t old_fs = get_fs();
 
 		set_fs(KERNEL_DS);
-		err = native_ioctl(file, realcmd, (unsigned long)&karg);
+		err = native_ioctl(file, cmd, (unsigned long)&karg);
 		set_fs(old_fs);
 	}
-	if (err == 0) {
-		switch (cmd) {
+
+	/* Special case: even after an error we need to put the
+	   results back for these ioctls since the error_idx will
+	   contain information on which control failed. */
+	switch (cmd) {
+	case VIDIOC_G_EXT_CTRLS:
+	case VIDIOC_S_EXT_CTRLS:
+	case VIDIOC_TRY_EXT_CTRLS:
+		if (put_v4l2_ext_controls32(&karg.v2ecs, up))
+			err = -EFAULT;
+		break;
+	}
+	if (err)
+		return err;
+
+	switch (cmd) {
 #ifdef CONFIG_VIDEO_V4L1_COMPAT
-		case VIDIOCGTUNER:
-			err = put_video_tuner32(&karg.vt, up);
-			break;
+	case VIDIOCGTUNER:
+		err = put_video_tuner32(&karg.vt, up);
+		break;
 
-		case VIDIOCGWIN:
-			err = put_video_window32(&karg.vw, up);
-			break;
+	case VIDIOCGWIN:
+		err = put_video_window32(&karg.vw, up);
+		break;
 
-		case VIDIOCGFBUF:
-			err = put_video_buffer32(&karg.vb, up);
-			break;
+	case VIDIOCGFBUF:
+		err = put_video_buffer32(&karg.vb, up);
+		break;
 
+	case VIDIOCGFREQ:
+		err = put_user(((u32)karg.vx), (u32 __user *)up);
+		break;
 #endif
-		case VIDIOC_G_FBUF:
-			err = put_v4l2_framebuffer32(&karg.v2fb, up);
-			break;
-
-		case VIDIOC_G_FMT:
-		case VIDIOC_S_FMT:
-		case VIDIOC_TRY_FMT:
-			err = put_v4l2_format32(&karg.v2f, up);
-			break;
-
-		case VIDIOC_QUERYBUF:
-		case VIDIOC_QBUF:
-		case VIDIOC_DQBUF:
-			err = put_v4l2_buffer32(&karg.v2b, up);
-			break;
-
-		case VIDIOC_ENUMSTD:
-			err = put_v4l2_standard(&karg.v2s, up);
-			break;
-
-		case VIDIOC_ENUMSTD32:
-			err = put_v4l2_standard32(&karg.v2s, up);
-			break;
-
-		case VIDIOC_G_TUNER:
-		case VIDIOC_S_TUNER:
-			err = put_v4l2_tuner(&karg.v2t, up);
-			break;
-
-		case VIDIOC_ENUMINPUT:
-			err = put_v4l2_input(&karg.v2i, up);
-			break;
-
-		case VIDIOC_ENUMINPUT32:
-			err = put_v4l2_input32(&karg.v2i, up);
-			break;
+	case VIDIOC_S_INPUT:
+	case VIDIOC_S_OUTPUT:
+	case VIDIOC_G_INPUT:
+	case VIDIOC_G_OUTPUT:
+		err = put_user(((s32)karg.vi), (s32 __user *)up);
+		break;
 
-#ifdef CONFIG_VIDEO_V4L1_COMPAT
-		case VIDIOCGFREQ:
-#endif
-		case VIDIOC_G_INPUT:
-			err = put_user(((u32)karg.vx), (u32 __user *)up);
-			break;
-		};
+	case VIDIOC_G_FBUF:
+		err = put_v4l2_framebuffer32(&karg.v2fb, up);
+		break;
+
+	case VIDIOC_G_FMT:
+	case VIDIOC_S_FMT:
+	case VIDIOC_TRY_FMT:
+		err = put_v4l2_format32(&karg.v2f, up);
+		break;
+
+	case VIDIOC_QUERYBUF:
+	case VIDIOC_QBUF:
+	case VIDIOC_DQBUF:
+		err = put_v4l2_buffer32(&karg.v2b, up);
+		break;
+
+	case VIDIOC_ENUMSTD:
+		err = put_v4l2_standard32(&karg.v2s, up);
+		break;
+
+	case VIDIOC_ENUMINPUT:
+		err = put_v4l2_input32(&karg.v2i, up);
+		break;
 	}
-out:
 	return err;
 }
 
@@ -828,26 +942,48 @@ long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 #ifdef CONFIG_VIDEO_V4L1_COMPAT
-	case VIDIOCSWIN32:
-		ret = do_set_window(file, cmd, arg);
-		break;
+	case VIDIOCGCAP:
+	case VIDIOCGCHAN:
+	case VIDIOCSCHAN:
 	case VIDIOCGTUNER32:
 	case VIDIOCSTUNER32:
+	case VIDIOCGPICT:
+	case VIDIOCSPICT:
+	case VIDIOCCAPTURE32:
 	case VIDIOCGWIN32:
+	case VIDIOCSWIN32:
 	case VIDIOCGFBUF32:
 	case VIDIOCSFBUF32:
+	case VIDIOCKEY:
 	case VIDIOCGFREQ32:
 	case VIDIOCSFREQ32:
 	case VIDIOCGAUDIO:
 	case VIDIOCSAUDIO:
+	case VIDIOCSYNC32:
+	case VIDIOCMCAPTURE:
+	case VIDIOCGMBUF:
+	case VIDIOCGUNIT:
+	case VIDIOCGCAPTURE:
+	case VIDIOCSCAPTURE:
+	case VIDIOCSPLAYMODE:
+	case VIDIOCSWRITEMODE32:
+	case VIDIOCGPLAYINFO:
+	case VIDIOCSMICROCODE32:
 	case VIDIOCGVBIFMT:
 	case VIDIOCSVBIFMT:
+#endif
+#ifdef __OLD_VIDIOC_
+	case VIDIOC_OVERLAY32_OLD:
+	case VIDIOC_S_PARM_OLD:
+	case VIDIOC_S_CTRL_OLD:
+	case VIDIOC_G_AUDIO_OLD:
+	case VIDIOC_G_AUDOUT_OLD:
+	case VIDIOC_CROPCAP_OLD:
 #endif
 	case VIDIOC_QUERYCAP:
+	case VIDIOC_RESERVED:
 	case VIDIOC_ENUM_FMT:
 	case VIDIOC_G_FMT32:
-	case VIDIOC_CROPCAP:
-	case VIDIOC_S_CROP:
 	case VIDIOC_S_FMT32:
 	case VIDIOC_REQBUFS:
 	case VIDIOC_QUERYBUF32:
@@ -862,43 +998,56 @@ long v4l_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
 	case VIDIOC_S_PARM:
 	case VIDIOC_G_STD:
 	case VIDIOC_S_STD:
-	case VIDIOC_G_TUNER:
-	case VIDIOC_S_TUNER:
-	case VIDIOC_ENUMSTD:
 	case VIDIOC_ENUMSTD32:
-	case VIDIOC_ENUMINPUT:
 	case VIDIOC_ENUMINPUT32:
 	case VIDIOC_G_CTRL:
 	case VIDIOC_S_CTRL:
-	case VIDIOC_S_CTRL32:
-	case VIDIOC_S_FREQUENCY:
-	case VIDIOC_G_FREQUENCY:
+	case VIDIOC_G_TUNER:
+	case VIDIOC_S_TUNER:
+	case VIDIOC_G_AUDIO:
+	case VIDIOC_S_AUDIO:
 	case VIDIOC_QUERYCTRL:
+	case VIDIOC_QUERYMENU:
 	case VIDIOC_G_INPUT32:
 	case VIDIOC_S_INPUT32:
+	case VIDIOC_G_OUTPUT32:
+	case VIDIOC_S_OUTPUT32:
+	case VIDIOC_ENUMOUTPUT:
+	case VIDIOC_G_AUDOUT:
+	case VIDIOC_S_AUDOUT:
+	case VIDIOC_G_MODULATOR:
+	case VIDIOC_S_MODULATOR:
+	case VIDIOC_S_FREQUENCY:
+	case VIDIOC_G_FREQUENCY:
+	case VIDIOC_CROPCAP:
+	case VIDIOC_G_CROP:
+	case VIDIOC_S_CROP:
+	case VIDIOC_G_JPEGCOMP:
+	case VIDIOC_S_JPEGCOMP:
+	case VIDIOC_QUERYSTD:
 	case VIDIOC_TRY_FMT32:
-	case VIDIOC_S_HW_FREQ_SEEK:
+	case VIDIOC_ENUMAUDIO:
+	case VIDIOC_ENUMAUDOUT:
+	case VIDIOC_G_PRIORITY:
+	case VIDIOC_S_PRIORITY:
+	case VIDIOC_G_SLICED_VBI_CAP:
+	case VIDIOC_LOG_STATUS:
+	case VIDIOC_G_EXT_CTRLS32:
+	case VIDIOC_S_EXT_CTRLS32:
+	case VIDIOC_TRY_EXT_CTRLS32:
 	case VIDIOC_ENUM_FRAMESIZES:
 	case VIDIOC_ENUM_FRAMEINTERVALS:
+	case VIDIOC_G_ENC_INDEX:
+	case VIDIOC_ENCODER_CMD:
+	case VIDIOC_TRY_ENCODER_CMD:
+	case VIDIOC_DBG_S_REGISTER:
+	case VIDIOC_DBG_G_REGISTER:
+	case VIDIOC_G_CHIP_IDENT:
+	case VIDIOC_S_HW_FREQ_SEEK:
 		ret = do_video_ioctl(file, cmd, arg);
 		break;
 
 #ifdef CONFIG_VIDEO_V4L1_COMPAT
-	/* Little v, the video4linux ioctls (conflict?) */
-	case VIDIOCGCAP:
-	case VIDIOCGCHAN:
-	case VIDIOCSCHAN:
-	case VIDIOCGPICT:
-	case VIDIOCSPICT:
-	case VIDIOCCAPTURE:
-	case VIDIOCKEY:
-	case VIDIOCSYNC:
-	case VIDIOCMCAPTURE:
-	case VIDIOCGMBUF:
-	case VIDIOCGUNIT:
-	case VIDIOCGCAPTURE:
-	case VIDIOCSCAPTURE:
-
 	/* BTTV specific... */
 	case _IOW('v',  BASE_VIDIOCPRIVATE+0, char [256]):
 	case _IOR('v',  BASE_VIDIOCPRIVATE+1, char [256]):
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 6e6743bd0f9..e2cfd6add78 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1465,6 +1465,8 @@ struct v4l2_chip_ident {
 #define VIDIOC_G_CHIP_IDENT     _IOWR('V', 81, struct v4l2_chip_ident)
 #endif
 #define VIDIOC_S_HW_FREQ_SEEK	 _IOW('V', 82, struct v4l2_hw_freq_seek)
+/* Reminder: when adding new ioctls please add support for them to
+   drivers/media/video/v4l2-compat-ioctl32.c as well! */
 
 #ifdef __OLD_VIDIOC_
 /* for compatibility, will go away some day */
-- 
cgit v1.2.3-70-g09d2


From 4b00eb25340c1a9b9eedaf0bc5b0f0d18eddb028 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Fri, 19 Dec 2008 11:17:56 -0300
Subject: V4L/DVB (9944): videodev2.h: fix typo.

The comment said CX2584X instead of CX2341X.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index e2cfd6add78..754c8d9685a 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1051,7 +1051,7 @@ enum v4l2_mpeg_video_bitrate_mode {
 #define V4L2_CID_MPEG_VIDEO_MUTE 		(V4L2_CID_MPEG_BASE+210)
 #define V4L2_CID_MPEG_VIDEO_MUTE_YUV 		(V4L2_CID_MPEG_BASE+211)
 
-/*  MPEG-class control IDs specific to the CX2584x driver as defined by V4L2 */
+/*  MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */
 #define V4L2_CID_MPEG_CX2341X_BASE 				(V4L2_CTRL_CLASS_MPEG | 0x1000)
 #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE 	(V4L2_CID_MPEG_CX2341X_BASE+0)
 enum v4l2_mpeg_cx2341x_video_spatial_filter_mode {
-- 
cgit v1.2.3-70-g09d2


From 531c98e71805b32e9ea35a218119100bbd2b7615 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Mon, 22 Dec 2008 13:18:27 -0300
Subject: V4L/DVB (9953): em28xx: Add suport for debugging AC97 anciliary chips

The em28xx driver can be coupled to an anciliary AC97 chip. This patch
allows read/write AC97 registers directly.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 drivers/media/video/em28xx/em28xx-core.c  |  4 ++--
 drivers/media/video/em28xx/em28xx-video.c | 19 +++++++++++++++++++
 drivers/media/video/em28xx/em28xx.h       |  3 +++
 include/linux/videodev2.h                 |  1 +
 4 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/em28xx/em28xx-core.c b/drivers/media/video/em28xx/em28xx-core.c
index c647dcb20a5..5964998daf2 100644
--- a/drivers/media/video/em28xx/em28xx-core.c
+++ b/drivers/media/video/em28xx/em28xx-core.c
@@ -259,7 +259,7 @@ static int em28xx_is_ac97_ready(struct em28xx *dev)
  * em28xx_read_ac97()
  * write a 16 bit value to the specified AC97 address (LSB first!)
  */
-static int em28xx_read_ac97(struct em28xx *dev, u8 reg)
+int em28xx_read_ac97(struct em28xx *dev, u8 reg)
 {
 	int ret;
 	u8 addr = (reg & 0x7f) | 0x80;
@@ -285,7 +285,7 @@ static int em28xx_read_ac97(struct em28xx *dev, u8 reg)
  * em28xx_write_ac97()
  * write a 16 bit value to the specified AC97 address (LSB first!)
  */
-static int em28xx_write_ac97(struct em28xx *dev, u8 reg, u16 val)
+int em28xx_write_ac97(struct em28xx *dev, u8 reg, u16 val)
 {
 	int ret;
 	u8 addr = reg & 0x7f;
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index 4701b6589b1..4ea4920c927 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -1221,6 +1221,17 @@ static int vidioc_g_register(struct file *file, void *priv,
 	struct em28xx         *dev = fh->dev;
 	int ret;
 
+	if (reg->match_type == V4L2_CHIP_MATCH_AC97) {
+		mutex_lock(&dev->lock);
+		ret = em28xx_read_ac97(dev, reg->reg);
+		mutex_unlock(&dev->lock);
+		if (ret < 0)
+			return ret;
+
+		reg->val = ret;
+		return 0;
+	}
+
 	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
 		return -EINVAL;
 
@@ -1256,6 +1267,14 @@ static int vidioc_s_register(struct file *file, void *priv,
 	__le64 buf;
 	int    rc;
 
+	if (reg->match_type == V4L2_CHIP_MATCH_AC97) {
+		mutex_lock(&dev->lock);
+		rc = em28xx_write_ac97(dev, reg->reg, reg->val);
+		mutex_unlock(&dev->lock);
+
+		return rc;
+	}
+
 	buf = cpu_to_le64(reg->val);
 
 	mutex_lock(&dev->lock);
diff --git a/drivers/media/video/em28xx/em28xx.h b/drivers/media/video/em28xx/em28xx.h
index 80d1d55d80f..100f27819cd 100644
--- a/drivers/media/video/em28xx/em28xx.h
+++ b/drivers/media/video/em28xx/em28xx.h
@@ -564,6 +564,9 @@ int em28xx_write_regs_req(struct em28xx *dev, u8 req, u16 reg, char *buf,
 int em28xx_write_regs(struct em28xx *dev, u16 reg, char *buf, int len);
 int em28xx_write_reg(struct em28xx *dev, u16 reg, u8 val);
 
+int em28xx_read_ac97(struct em28xx *dev, u8 reg);
+int em28xx_write_ac97(struct em28xx *dev, u8 reg, u16 val);
+
 int em28xx_audio_analog_set(struct em28xx *dev);
 int em28xx_audio_setup(struct em28xx *dev);
 
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 754c8d9685a..e31144d2223 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1376,6 +1376,7 @@ struct v4l2_streamparm {
 #define V4L2_CHIP_MATCH_HOST       0  /* Match against chip ID on host (0 for the host) */
 #define V4L2_CHIP_MATCH_I2C_DRIVER 1  /* Match against I2C driver ID */
 #define V4L2_CHIP_MATCH_I2C_ADDR   2  /* Match against I2C 7-bit address */
+#define V4L2_CHIP_MATCH_AC97       3  /* Match against anciliary AC97 chip */
 
 struct v4l2_register {
 	__u32 match_type; /* Match type */
-- 
cgit v1.2.3-70-g09d2


From 91962fa713bd8bf47434b02ac661fdc201365fa5 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 18 Dec 2008 11:45:00 -0300
Subject: V4L/DVB (10078): video: add NV16 and NV61 pixel formats

This patch adds support for NV16 and NV61 pixel formats.

These pixel formats use two planes; one for 8-bit Y values and
one for interleaved 8-bit U and V values. NV16/NV61 formats are
very similar to NV12/NV21 with the exception that NV16/NV61 are
using the same number of lines for both planes. The difference
between NV16 and NV61 is the U and V byte order.

The fourcc values are extrapolated from the NV12/NV21 case.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/videodev2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index e31144d2223..1f126e30766 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -305,6 +305,8 @@ struct v4l2_pix_format {
 /* two planes -- one Y, one Cr + Cb interleaved  */
 #define V4L2_PIX_FMT_NV12    v4l2_fourcc('N', 'V', '1', '2') /* 12  Y/CbCr 4:2:0  */
 #define V4L2_PIX_FMT_NV21    v4l2_fourcc('N', 'V', '2', '1') /* 12  Y/CrCb 4:2:0  */
+#define V4L2_PIX_FMT_NV16    v4l2_fourcc('N', 'V', '1', '6') /* 16  Y/CbCr 4:2:2  */
+#define V4L2_PIX_FMT_NV61    v4l2_fourcc('N', 'V', '6', '1') /* 16  Y/CrCb 4:2:2  */
 
 /*  The following formats are not defined in the V4L2 specification */
 #define V4L2_PIX_FMT_YUV410  v4l2_fourcc('Y', 'U', 'V', '9') /*  9  YUV 4:1:0     */
-- 
cgit v1.2.3-70-g09d2


From 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 31 Dec 2008 15:11:37 +0100
Subject: [PATCH] fix scaled & unscaled cputime accounting

The utimescaled / stimescaled fields in the task structure and the
global cpustat should be set on all architectures. On s390 the calls
to account_user_time_scaled and account_system_time_scaled never have
been added. In addition system time that is accounted as guest time
to the user time of a process is accounted to the scaled system time
instead of the scaled user time.
To fix the bugs and to prevent future forgetfulness this patch merges
account_system_time_scaled into account_system_time and
account_user_time_scaled into account_user_time.

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Michael Neuling <mikey@neuling.org>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/ia64/kernel/time.c     | 12 ++++--------
 arch/powerpc/kernel/time.c  |  7 ++-----
 arch/s390/kernel/vtime.c    | 10 +++++-----
 include/linux/kernel_stat.h |  6 ++----
 kernel/sched.c              | 41 ++++++++++++++++-------------------------
 kernel/time/tick-sched.c    |  5 +++--
 kernel/timer.c              | 12 +++++-------
 7 files changed, 37 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 65c10a42c88..4ee36781704 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -93,13 +93,11 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp));
-	account_system_time(prev, 0, delta_stime);
-	account_system_time_scaled(prev, delta_stime);
+	account_system_time(prev, 0, delta_stime, delta_stime);
 
 	if (pi->ac_utime) {
 		delta_utime = cycle_to_cputime(pi->ac_utime);
-		account_user_time(prev, delta_utime);
-		account_user_time_scaled(prev, delta_utime);
+		account_user_time(prev, delta_utime, delta_utime);
 	}
 
 	pi->ac_stamp = ni->ac_stamp = now;
@@ -122,8 +120,7 @@ void account_system_vtime(struct task_struct *tsk)
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
-	account_system_time(tsk, 0, delta_stime);
-	account_system_time_scaled(tsk, delta_stime);
+	account_system_time(tsk, 0, delta_stime, delta_stime);
 	ti->ac_stime = 0;
 
 	ti->ac_stamp = now;
@@ -143,8 +140,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 
 	if (ti->ac_utime) {
 		delta_utime = cycle_to_cputime(ti->ac_utime);
-		account_user_time(p, delta_utime);
-		account_user_time_scaled(p, delta_utime);
+		account_user_time(p, delta_utime, delta_utime);
 		ti->ac_utime = 0;
 	}
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e1f3a514042..92650ccad2e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -256,8 +256,7 @@ void account_system_vtime(struct task_struct *tsk)
 		delta += sys_time;
 		get_paca()->system_time = 0;
 	}
-	account_system_time(tsk, 0, delta);
-	account_system_time_scaled(tsk, deltascaled);
+	account_system_time(tsk, 0, delta, deltascaled);
 	per_cpu(cputime_last_delta, smp_processor_id()) = delta;
 	per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled;
 	local_irq_restore(flags);
@@ -275,10 +274,8 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 
 	utime = get_paca()->user_time;
 	get_paca()->user_time = 0;
-	account_user_time(tsk, utime);
-
 	utimescaled = cputime_to_scaled(utime);
-	account_user_time_scaled(tsk, utimescaled);
+	account_user_time(tsk, utime, utimescaled);
 }
 
 /*
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 75a6e62ea97..07283aea2e5 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -50,12 +50,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 	rcu_user_flag = cputime != 0;
 	S390_lowcore.user_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_user_time(tsk, cputime);
+	account_user_time(tsk, cputime, cputime);
 
 	cputime =  S390_lowcore.system_timer >> 12;
 	S390_lowcore.system_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, HARDIRQ_OFFSET, cputime);
+	account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime);
 
 	cputime = S390_lowcore.steal_clock;
 	if ((__s64) cputime > 0) {
@@ -82,12 +82,12 @@ void account_vtime(struct task_struct *tsk)
 	cputime = S390_lowcore.user_timer >> 12;
 	S390_lowcore.user_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_user_time(tsk, cputime);
+	account_user_time(tsk, cputime, cputime);
 
 	cputime =  S390_lowcore.system_timer >> 12;
 	S390_lowcore.system_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, 0, cputime);
+	account_system_time(tsk, 0, cputime, cputime);
 }
 
 /*
@@ -107,7 +107,7 @@ void account_system_vtime(struct task_struct *tsk)
 	cputime =  S390_lowcore.system_timer >> 12;
 	S390_lowcore.system_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, 0, cputime);
+	account_system_time(tsk, 0, cputime, cputime);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4ee4b3d2316..c78a459662a 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,10 +79,8 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 }
 
 extern unsigned long long task_delta_exec(struct task_struct *);
-extern void account_user_time(struct task_struct *, cputime_t);
-extern void account_user_time_scaled(struct task_struct *, cputime_t);
-extern void account_system_time(struct task_struct *, int, cputime_t);
-extern void account_system_time_scaled(struct task_struct *, cputime_t);
+extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
+extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
 extern void account_steal_time(struct task_struct *, cputime_t);
 
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/sched.c b/kernel/sched.c
index fff1c4a20b6..5b03679ff71 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4080,13 +4080,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
-void account_user_time(struct task_struct *p, cputime_t cputime)
+void account_user_time(struct task_struct *p, cputime_t cputime,
+		       cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	cputime64_t tmp;
 
+	/* Add user time to process. */
 	p->utime = cputime_add(p->utime, cputime);
+	p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
 	account_group_user_time(p, cputime);
 
 	/* Add user time to cpustat. */
@@ -4103,51 +4107,49 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
  * Account guest cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+			       cputime_t cputime_scaled)
 {
 	cputime64_t tmp;
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
 	tmp = cputime_to_cputime64(cputime);
 
+	/* Add guest time to process. */
 	p->utime = cputime_add(p->utime, cputime);
+	p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
 	account_group_user_time(p, cputime);
 	p->gtime = cputime_add(p->gtime, cputime);
 
+	/* Add guest time to cpustat. */
 	cpustat->user = cputime64_add(cpustat->user, tmp);
 	cpustat->guest = cputime64_add(cpustat->guest, tmp);
 }
 
-/*
- * Account scaled user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
-{
-	p->utimescaled = cputime_add(p->utimescaled, cputime);
-}
-
 /*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
  * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
  */
 void account_system_time(struct task_struct *p, int hardirq_offset,
-			 cputime_t cputime)
+			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	struct rq *rq = this_rq();
 	cputime64_t tmp;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-		account_guest_time(p, cputime);
+		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
+	/* Add system time to process. */
 	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
 	account_group_system_time(p, cputime);
 
 	/* Add system time to cpustat. */
@@ -4166,17 +4168,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	acct_update_integrals(p);
 }
 
-/*
- * Account scaled system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- */
-void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
-{
-	p->stimescaled = cputime_add(p->stimescaled, cputime);
-}
-
 /*
  * Account for involuntary wait time.
  * @p: the process from which the cpu time has been stolen
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8f3fc2582d3..1f2fce2479f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -420,6 +420,7 @@ void tick_nohz_restart_sched_tick(void)
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long ticks;
+	cputime_t cputime;
 	ktime_t now;
 
 	local_irq_disable();
@@ -452,8 +453,8 @@ void tick_nohz_restart_sched_tick(void)
 	 */
 	if (ticks && ticks < LONG_MAX) {
 		add_preempt_count(HARDIRQ_OFFSET);
-		account_system_time(current, HARDIRQ_OFFSET,
-				    jiffies_to_cputime(ticks));
+		cputime = jiffies_to_cputime(ticks);
+		account_system_time(current, HARDIRQ_OFFSET, cputime, cputime);
 		sub_preempt_count(HARDIRQ_OFFSET);
 	}
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 566257d1dc1..b5efb528aa1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1023,13 +1023,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 {
 	cputime_t one_jiffy = jiffies_to_cputime(1);
 
-	if (user_tick) {
-		account_user_time(p, one_jiffy);
-		account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
-	} else {
-		account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
-		account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
-	}
+	if (user_tick)
+		account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy));
+	else
+		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+				    cputime_to_scaled(one_jiffy));
 }
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 31 Dec 2008 15:11:38 +0100
Subject: [PATCH] idle cputime accounting

The cpu time spent by the idle process actually doing something is
currently accounted as idle time. This is plain wrong, the architectures
that support VIRT_CPU_ACCOUNTING=y can do better: distinguish between the
time spent doing nothing and the time spent by idle doing work. The first
is accounted with account_idle_time and the second with account_system_time.
The architectures that use the account_xxx_time interface directly and not
the account_xxx_ticks interface now need to do the check for the idle
process in their arch code. In particular to improve the system vs true
idle time accounting the arch code needs to measure the true idle time
instead of just testing for the idle process.
To improve the tick based accounting as well we would need an architecture
primitive that can tell us if the pt_regs of the interrupted context
points to the magic instruction that halts the cpu.

In addition idle time is no more added to the stime of the idle process.
This field now contains the system time of the idle process as it should
be. On systems without VIRT_CPU_ACCOUNTING this will always be zero as
every tick that occurs while idle is running will be accounted as idle
time.

This patch contains the necessary common code changes to be able to
distinguish idle system time and true idle time. The architectures with
support for VIRT_CPU_ACCOUNTING need some changes to exploit this.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/ia64/kernel/time.c       | 10 ++++--
 arch/powerpc/kernel/process.c |  1 +
 arch/powerpc/kernel/time.c    | 13 +++++--
 arch/s390/kernel/vtime.c      | 20 ++++++++---
 arch/x86/xen/time.c           | 10 +++---
 include/linux/kernel_stat.h   |  7 +++-
 include/linux/sched.h         |  1 -
 kernel/sched.c                | 80 ++++++++++++++++++++++++++++++++++---------
 kernel/time/tick-sched.c      | 13 ++++---
 kernel/timer.c                | 13 -------
 10 files changed, 114 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 4ee36781704..f0ebb342409 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -93,7 +93,10 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp));
-	account_system_time(prev, 0, delta_stime, delta_stime);
+	if (idle_task(smp_processor_id()) != prev)
+		account_system_time(prev, 0, delta_stime, delta_stime);
+	else
+		account_idle_time(delta_stime);
 
 	if (pi->ac_utime) {
 		delta_utime = cycle_to_cputime(pi->ac_utime);
@@ -120,7 +123,10 @@ void account_system_vtime(struct task_struct *tsk)
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
-	account_system_time(tsk, 0, delta_stime, delta_stime);
+	if (irq_count() || idle_task(smp_processor_id()) != tsk)
+		account_system_time(tsk, 0, delta_stime, delta_stime);
+	else
+		account_idle_time(delta_stime);
 	ti->ac_stime = 0;
 
 	ti->ac_stamp = now;
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 51b201ddf9a..fb7049c054c 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -33,6 +33,7 @@
 #include <linux/mqueue.h>
 #include <linux/hardirq.h>
 #include <linux/utsname.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 92650ccad2e..3be355c1cfa 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -256,7 +256,10 @@ void account_system_vtime(struct task_struct *tsk)
 		delta += sys_time;
 		get_paca()->system_time = 0;
 	}
-	account_system_time(tsk, 0, delta, deltascaled);
+	if (in_irq() || idle_task(smp_processor_id()) != tsk)
+		account_system_time(tsk, 0, delta, deltascaled);
+	else
+		account_idle_time(delta);
 	per_cpu(cputime_last_delta, smp_processor_id()) = delta;
 	per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled;
 	local_irq_restore(flags);
@@ -335,8 +338,12 @@ void calculate_steal_time(void)
 	tb = mftb();
 	purr = mfspr(SPRN_PURR);
 	stolen = (tb - pme->tb) - (purr - pme->purr);
-	if (stolen > 0)
-		account_steal_time(current, stolen);
+	if (stolen > 0) {
+		if (idle_task(smp_processor_id()) != current)
+			account_steal_time(stolen);
+		else
+			account_idle_time(stolen);
+	}
 	pme->tb = tb;
 	pme->purr = purr;
 }
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 07283aea2e5..4a4a34caec5 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -55,13 +55,19 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 	cputime =  S390_lowcore.system_timer >> 12;
 	S390_lowcore.system_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime);
+	if (idle_task(smp_processor_id()) != current)
+		account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime);
+	else
+		account_idle_time(cputime);
 
 	cputime = S390_lowcore.steal_clock;
 	if ((__s64) cputime > 0) {
 		cputime >>= 12;
 		S390_lowcore.steal_clock -= cputime << 12;
-		account_steal_time(tsk, cputime);
+		if (idle_task(smp_processor_id()) != current)
+			account_steal_time(cputime);
+		else
+			account_idle_time(cputime);
 	}
 }
 
@@ -87,7 +93,10 @@ void account_vtime(struct task_struct *tsk)
 	cputime =  S390_lowcore.system_timer >> 12;
 	S390_lowcore.system_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, 0, cputime, cputime);
+	if (idle_task(smp_processor_id()) != current)
+		account_system_time(tsk, 0, cputime, cputime);
+	else
+		account_idle_time(cputime);
 }
 
 /*
@@ -107,7 +116,10 @@ void account_system_vtime(struct task_struct *tsk)
 	cputime =  S390_lowcore.system_timer >> 12;
 	S390_lowcore.system_timer -= cputime << 12;
 	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, 0, cputime, cputime);
+	if (in_irq() || idle_task(smp_processor_id()) != current)
+		account_system_time(tsk, 0, cputime, cputime);
+	else
+		account_idle_time(cputime);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c9f7cda48ed..732e52dc991 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -132,8 +132,7 @@ static void do_stolen_accounting(void)
 	*snap = state;
 
 	/* Add the appropriate number of ticks of stolen time,
-	   including any left-overs from last time.  Passing NULL to
-	   account_steal_time accounts the time as stolen. */
+	   including any left-overs from last time. */
 	stolen = runnable + offline + __get_cpu_var(residual_stolen);
 
 	if (stolen < 0)
@@ -141,11 +140,10 @@ static void do_stolen_accounting(void)
 
 	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
 	__get_cpu_var(residual_stolen) = stolen;
-	account_steal_time(NULL, ticks);
+	account_steal_ticks(ticks);
 
 	/* Add the appropriate number of ticks of blocked time,
-	   including any left-overs from last time.  Passing idle to
-	   account_steal_time accounts the time as idle/wait. */
+	   including any left-overs from last time. */
 	blocked += __get_cpu_var(residual_blocked);
 
 	if (blocked < 0)
@@ -153,7 +151,7 @@ static void do_stolen_accounting(void)
 
 	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
 	__get_cpu_var(residual_blocked) = blocked;
-	account_steal_time(idle_task(smp_processor_id()), ticks);
+	account_idle_ticks(ticks);
 }
 
 /*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index c78a459662a..570d2041311 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,6 +81,11 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 extern unsigned long long task_delta_exec(struct task_struct *);
 extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
-extern void account_steal_time(struct task_struct *, cputime_t);
+extern void account_steal_time(cputime_t);
+extern void account_idle_time(cputime_t);
+
+extern void account_process_tick(struct task_struct *, int user);
+extern void account_steal_ticks(unsigned long ticks);
+extern void account_idle_ticks(unsigned long ticks);
 
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8395e715809..b475d4db805 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -284,7 +284,6 @@ long io_schedule_timeout(long timeout);
 
 extern void cpu_init (void);
 extern void trap_init(void);
-extern void account_process_tick(struct task_struct *task, int user);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 5b03679ff71..635eaffe1e4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4139,7 +4139,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	struct rq *rq = this_rq();
 	cputime64_t tmp;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
@@ -4158,37 +4157,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	else if (softirq_count())
 		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-	else if (p != rq->idle)
-		cpustat->system = cputime64_add(cpustat->system, tmp);
-	else if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 	else
-		cpustat->idle = cputime64_add(cpustat->idle, tmp);
+		cpustat->system = cputime64_add(cpustat->system, tmp);
+
 	/* Account for system time used */
 	acct_update_integrals(p);
 }
 
 /*
  * Account for involuntary wait time.
- * @p: the process from which the cpu time has been stolen
  * @steal: the cpu time spent in involuntary wait
  */
-void account_steal_time(struct task_struct *p, cputime_t steal)
+void account_steal_time(cputime_t cputime)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
+
+	cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp = cputime_to_cputime64(steal);
+	cputime64_t cputime64 = cputime_to_cputime64(cputime);
 	struct rq *rq = this_rq();
 
-	if (p == rq->idle) {
-		p->stime = cputime_add(p->stime, steal);
-		if (atomic_read(&rq->nr_iowait) > 0)
-			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-		else
-			cpustat->idle = cputime64_add(cpustat->idle, tmp);
-	} else
-		cpustat->steal = cputime64_add(cpustat->steal, tmp);
+	if (atomic_read(&rq->nr_iowait) > 0)
+		cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+	else
+		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+	cputime_t one_jiffy = jiffies_to_cputime(1);
+	cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+	struct rq *rq = this_rq();
+
+	if (user_tick)
+		account_user_time(p, one_jiffy, one_jiffy_scaled);
+	else if (p != rq->idle)
+		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+				    one_jiffy_scaled);
+	else
+		account_idle_time(one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+	account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+	account_idle_time(jiffies_to_cputime(ticks));
 }
 
+#endif
+
 /*
  * Use precise platform statistics if available:
  */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1f2fce2479f..611fa4c0baa 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -419,8 +419,9 @@ void tick_nohz_restart_sched_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	unsigned long ticks;
-	cputime_t cputime;
+#endif
 	ktime_t now;
 
 	local_irq_disable();
@@ -442,6 +443,7 @@ void tick_nohz_restart_sched_tick(void)
 	tick_do_update_jiffies64(now);
 	cpu_clear(cpu, nohz_cpu_mask);
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 	/*
 	 * We stopped the tick in idle. Update process times would miss the
 	 * time we slept as update_process_times does only a 1 tick
@@ -451,12 +453,9 @@ void tick_nohz_restart_sched_tick(void)
 	/*
 	 * We might be one off. Do not randomly account a huge number of ticks!
 	 */
-	if (ticks && ticks < LONG_MAX) {
-		add_preempt_count(HARDIRQ_OFFSET);
-		cputime = jiffies_to_cputime(ticks);
-		account_system_time(current, HARDIRQ_OFFSET, cputime, cputime);
-		sub_preempt_count(HARDIRQ_OFFSET);
-	}
+	if (ticks && ticks < LONG_MAX)
+		account_idle_ticks(ticks);
+#endif
 
 	touch_softlockup_watchdog();
 	/*
diff --git a/kernel/timer.c b/kernel/timer.c
index b5efb528aa1..dee3f641a7a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1018,19 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 }
 #endif
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-	cputime_t one_jiffy = jiffies_to_cputime(1);
-
-	if (user_tick)
-		account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy));
-	else
-		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
-				    cputime_to_scaled(one_jiffy));
-}
-#endif
-
 /*
  * Called from the timer interrupt handler to charge one tick to the current
  * process.  user_tick is 1 if the tick is user time, 0 for system.
-- 
cgit v1.2.3-70-g09d2


From c4abb7c9cde24b7351a47328ef866e6a2bbb1ad0 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:55 +0200
Subject: KVM: x86: Support for user space injected NMIs

Introduces the KVM_NMI IOCTL to the generic x86 part of KVM for
injecting NMIs from user space and also extends the statistic report
accordingly.

Based on the original patch by Sheng Yang.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              | 46 +++++++++++++++++++++++++++++++++++++++--
 include/linux/kvm.h             | 11 ++++++++--
 3 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bfbbdea869b..a40fa847892 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -398,6 +398,7 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
+	u32 request_nmi_exits;
 	u32 irq_exits;
 	u32 host_state_reload;
 	u32 efer_reload;
@@ -406,6 +407,7 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation_fail;
 	u32 hypercalls;
 	u32 irq_injections;
+	u32 nmi_injections;
 };
 
 struct descriptor_table {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1fa9a6db633..07971451b94 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -86,6 +86,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
+	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
 	{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +94,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 	{ "irq_injections", VCPU_STAT(irq_injections) },
+	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -1318,6 +1320,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_inject_nmi(vcpu);
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
 					   struct kvm_tpr_access_ctl *tac)
 {
@@ -1377,6 +1388,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_NMI: {
+		r = kvm_vcpu_ioctl_nmi(vcpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	case KVM_SET_CPUID: {
 		struct kvm_cpuid __user *cpuid_arg = argp;
 		struct kvm_cpuid cpuid;
@@ -2812,18 +2830,37 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
 }
 
+/*
+ * Check if userspace requested a NMI window, and that the NMI window
+ * is open.
+ *
+ * No need to exit to userspace if we already have a NMI queued.
+ */
+static int dm_request_for_nmi_injection(struct kvm_vcpu *vcpu,
+					struct kvm_run *kvm_run)
+{
+	return (!vcpu->arch.nmi_pending &&
+		kvm_run->request_nmi_window &&
+		vcpu->arch.nmi_window_open);
+}
+
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
 	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (irqchip_in_kernel(vcpu->kvm)) {
 		kvm_run->ready_for_interrupt_injection = 1;
-	else
+		kvm_run->ready_for_nmi_injection = 1;
+	} else {
 		kvm_run->ready_for_interrupt_injection =
 					(vcpu->arch.interrupt_window_open &&
 					 vcpu->arch.irq_summary == 0);
+		kvm_run->ready_for_nmi_injection =
+					(vcpu->arch.nmi_window_open &&
+					 vcpu->arch.nmi_pending == 0);
+	}
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -2999,6 +3036,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 
 		if (r > 0) {
+			if (dm_request_for_nmi_injection(vcpu, kvm_run)) {
+				r = -EINTR;
+				kvm_run->exit_reason = KVM_EXIT_NMI;
+				++vcpu->stat.request_nmi_exits;
+			}
 			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 				r = -EINTR;
 				kvm_run->exit_reason = KVM_EXIT_INTR;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f18b86fa865..44fd7fa0af2 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -83,18 +83,22 @@ struct kvm_irqchip {
 #define KVM_EXIT_S390_SIEIC       13
 #define KVM_EXIT_S390_RESET       14
 #define KVM_EXIT_DCR              15
+#define KVM_EXIT_NMI              16
+#define KVM_EXIT_NMI_WINDOW_OPEN  17
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
 	/* in */
 	__u8 request_interrupt_window;
-	__u8 padding1[7];
+	__u8 request_nmi_window;
+	__u8 padding1[6];
 
 	/* out */
 	__u32 exit_reason;
 	__u8 ready_for_interrupt_injection;
 	__u8 if_flag;
-	__u8 padding2[2];
+	__u8 ready_for_nmi_injection;
+	__u8 padding2;
 
 	/* in (pre_kvm_run), out (post_kvm_run) */
 	__u64 cr8;
@@ -387,6 +391,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
+#define KVM_CAP_NMI 19
 
 /*
  * ioctls for VM fds
@@ -458,6 +463,8 @@ struct kvm_trace_rec {
 #define KVM_S390_INITIAL_RESET    _IO(KVMIO,  0x97)
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_NMI */
+#define KVM_NMI                   _IO(KVMIO,  0x9a)
 
 #define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
 #define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
-- 
cgit v1.2.3-70-g09d2


From e19e30effac03f5a005a8e42ed941a2a5dc62654 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 20 Oct 2008 16:07:10 +0800
Subject: KVM: IRQ ACK notifier should be used with in-kernel irqchip

Also remove unnecessary parameter of unregister irq ack notifier.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 3 +--
 virt/kvm/irq_comm.c      | 8 ++++++--
 virt/kvm/kvm_main.c      | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bb92be2153b..3a0fb77d1f6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -316,8 +316,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 55ad76ee2d0..9fbbdea3d1d 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -58,12 +58,16 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian)
 {
+	/* Must be called with in-kernel IRQ chip, otherwise it's nonsense */
+	ASSERT(irqchip_in_kernel(kvm));
+	ASSERT(kian);
 	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
 }
 
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian)
+void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian)
 {
+	if (!kian)
+		return;
 	hlist_del(&kian->link);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a87f45edfae..4f43abe198e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -143,7 +143,7 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
-	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
 	kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 
 	if (cancel_work_sync(&assigned_dev->interrupt_work))
-- 
cgit v1.2.3-70-g09d2


From 4f906c19ae29397409bedabf7bbe5cb42ad90332 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:51 +0800
Subject: KVM: Replace irq_requested with more generic irq_requested_type

Separate guest irq type and host irq type, for we can support guest using INTx
with host using MSI (but not opposite combination).

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 4 +++-
 virt/kvm/kvm_main.c      | 9 +++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3a0fb77d1f6..c3d4b96a08f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -307,7 +307,9 @@ struct kvm_assigned_dev_kernel {
 	int host_devfn;
 	int host_irq;
 	int guest_irq;
-	int irq_requested;
+#define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
+#define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
+	unsigned long irq_requested_type;
 	int irq_source_id;
 	struct pci_dev *dev;
 	struct kvm *kvm;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ef2f03cf42c..638de47e167 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -140,7 +140,7 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
-	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
+	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested_type)
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
@@ -180,7 +180,7 @@ static int assigned_device_update_intx(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *adev,
 			struct kvm_assigned_irq *airq)
 {
-	if (adev->irq_requested) {
+	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX) {
 		adev->guest_irq = airq->guest_irq;
 		adev->ack_notifier.gsi = airq->guest_irq;
 		return 0;
@@ -207,7 +207,8 @@ static int assigned_device_update_intx(struct kvm *kvm,
 			return -EIO;
 	}
 
-	adev->irq_requested = true;
+	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
+				   KVM_ASSIGNED_DEV_HOST_INTX;
 	return 0;
 }
 
@@ -227,7 +228,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		return -EINVAL;
 	}
 
-	if (!match->irq_requested) {
+	if (!match->irq_requested_type) {
 		INIT_WORK(&match->interrupt_work,
 				kvm_assigned_dev_interrupt_work_handler);
 		if (irqchip_in_kernel(kvm)) {
-- 
cgit v1.2.3-70-g09d2


From 0937c48d075ddd59ae2c12a6fa8308b9c7a63753 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:53 +0800
Subject: KVM: Add fields for MSI device assignment

Prepared for kvm_arch_assigned_device_msi_dispatch().

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h      | 7 +++++++
 include/linux/kvm_host.h | 4 ++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 44fd7fa0af2..bb283c388a2 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -507,10 +507,17 @@ struct kvm_assigned_irq {
 	__u32 guest_irq;
 	__u32 flags;
 	union {
+		struct {
+			__u32 addr_lo;
+			__u32 addr_hi;
+			__u32 data;
+		} guest_msi;
 		__u32 reserved[12];
 	};
 };
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
+#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c3d4b96a08f..8091a4d90dd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/mm.h>
 #include <linux/preempt.h>
 #include <linux/marker.h>
+#include <linux/msi.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -307,8 +308,11 @@ struct kvm_assigned_dev_kernel {
 	int host_devfn;
 	int host_irq;
 	int guest_irq;
+	struct msi_msg guest_msi;
 #define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
+#define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
 #define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
+#define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	struct pci_dev *dev;
-- 
cgit v1.2.3-70-g09d2


From 6b9cc7fd469869bed38831c5adac3f59dc25eaf5 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:56 +0800
Subject: KVM: Enable MSI for device assignment

We enable guest MSI and host MSI support in this patch. The userspace want to
enable MSI should set KVM_DEV_IRQ_ASSIGN_ENABLE_MSI in the assigned_irq's flag.
Function would return -ENOTTY if can't enable MSI, userspace shouldn't set MSI
Enable bit when KVM_ASSIGN_IRQ return -ENOTTY with
KVM_DEV_IRQ_ASSIGN_ENABLE_MSI.

Userspace can tell the support of MSI device from #ifdef KVM_CAP_DEVICE_MSI.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h |  3 ++
 virt/kvm/kvm_main.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 78 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index bb283c388a2..0997e6f5490 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -392,6 +392,9 @@ struct kvm_trace_rec {
 #endif
 #define KVM_CAP_IOMMU 18
 #define KVM_CAP_NMI 19
+#if defined(CONFIG_X86)
+#define KVM_CAP_DEVICE_MSI 20
+#endif
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 228c1d18a61..bf36ae9ae7d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -159,9 +159,15 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 * finer-grained lock, update this
 	 */
 	mutex_lock(&assigned_dev->kvm->lock);
-	kvm_set_irq(assigned_dev->kvm,
-		    assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 1);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX)
+		kvm_set_irq(assigned_dev->kvm,
+			    assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+	else if (assigned_dev->irq_requested_type &
+				KVM_ASSIGNED_DEV_GUEST_MSI) {
+		assigned_device_msi_dispatch(assigned_dev);
+		enable_irq(assigned_dev->host_irq);
+	}
 	mutex_unlock(&assigned_dev->kvm->lock);
 	kvm_put_kvm(assigned_dev->kvm);
 }
@@ -197,6 +203,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 {
 	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested_type)
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+		pci_disable_msi(assigned_dev->dev);
 
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
 	kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
@@ -242,6 +250,11 @@ static int assigned_device_update_intx(struct kvm *kvm,
 		return 0;
 
 	if (irqchip_in_kernel(kvm)) {
+		if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) {
+			free_irq(adev->host_irq, (void *)kvm);
+			pci_disable_msi(adev->dev);
+		}
+
 		if (!capable(CAP_SYS_RAWIO))
 			return -EPERM;
 
@@ -265,6 +278,41 @@ static int assigned_device_update_intx(struct kvm *kvm,
 	return 0;
 }
 
+#ifdef CONFIG_X86
+static int assigned_device_update_msi(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *adev,
+			struct kvm_assigned_irq *airq)
+{
+	int r;
+
+	/* x86 don't care upper address of guest msi message addr */
+	adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
+	adev->guest_msi.data = airq->guest_msi.data;
+	adev->ack_notifier.gsi = -1;
+
+	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+		return 0;
+
+	if (irqchip_in_kernel(kvm)) {
+		if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
+			free_irq(adev->host_irq, (void *)adev);
+
+		r = pci_enable_msi(adev->dev);
+		if (r)
+			return r;
+
+		adev->host_irq = adev->dev->irq;
+		if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
+				"kvm_assigned_msi_device", (void *)adev))
+			return -EIO;
+	}
+
+	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI |
+				   KVM_ASSIGNED_DEV_HOST_MSI;
+	return 0;
+}
+#endif
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -301,9 +349,30 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		}
 	}
 
-	r = assigned_device_update_intx(kvm, match, assigned_irq);
-	if (r)
-		goto out_release;
+	if (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
+#ifdef CONFIG_X86
+		r = assigned_device_update_msi(kvm, match, assigned_irq);
+		if (r) {
+			printk(KERN_WARNING "kvm: failed to enable "
+					"MSI device!\n");
+			goto out_release;
+		}
+#else
+		r = -ENOTTY;
+#endif
+	} else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
+		/* Host device IRQ 0 means don't support INTx */
+		printk(KERN_WARNING "kvm: wait device to enable MSI!\n");
+		r = 0;
+	} else {
+		/* Non-sharing INTx mode */
+		r = assigned_device_update_intx(kvm, match, assigned_irq);
+		if (r) {
+			printk(KERN_WARNING "kvm: failed to enable "
+					"INTx device!\n");
+			goto out_release;
+		}
+	}
 
 	mutex_unlock(&kvm->lock);
 	return r;
-- 
cgit v1.2.3-70-g09d2


From 1a811b6167089bcdb84284f2dc9fd0b4d0f1899d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Dec 2008 18:25:27 +0200
Subject: KVM: Advertise the bug in memory region destruction as fixed

Userspace might need to act differently.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h |  2 ++
 virt/kvm/kvm_main.c | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 0997e6f5490..48807767e72 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -395,6 +395,8 @@ struct kvm_trace_rec {
 #if defined(CONFIG_X86)
 #define KVM_CAP_DEVICE_MSI 20
 #endif
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e7644b90667..e066eb125e5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1905,6 +1905,17 @@ static int kvm_dev_ioctl_create_vm(void)
 	return fd;
 }
 
+static long kvm_dev_ioctl_check_extension_generic(long arg)
+{
+	switch (arg) {
+	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+		return 1;
+	default:
+		break;
+	}
+	return kvm_dev_ioctl_check_extension(arg);
+}
+
 static long kvm_dev_ioctl(struct file *filp,
 			  unsigned int ioctl, unsigned long arg)
 {
@@ -1924,7 +1935,7 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = kvm_dev_ioctl_create_vm();
 		break;
 	case KVM_CHECK_EXTENSION:
-		r = kvm_dev_ioctl_check_extension(arg);
+		r = kvm_dev_ioctl_check_extension_generic(arg);
 		break;
 	case KVM_GET_VCPU_MMAP_SIZE:
 		r = -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From defaf1587c5d7dff828f6f11c8941e5bcef00f50 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 2 Dec 2008 12:16:33 +0000
Subject: KVM: fix handling of ACK from shared guest IRQ

If an assigned device shares a guest irq with an emulated
device then we currently interpret an ack generated by the
emulated device as originating from the assigned device
leading to e.g. "Unbalanced enable for IRQ 4347" from the
enable_irq() in kvm_assigned_dev_ack_irq().

The fix is fairly simple - don't enable the physical device
irq unless it was previously disabled.

Of course, this can still lead to a situation where a
non-assigned device ACK can cause the physical device irq to
be reenabled before the device was serviced. However, being
level sensitive, the interrupt will merely be regenerated.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8091a4d90dd..eafabd5c66b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -307,6 +307,7 @@ struct kvm_assigned_dev_kernel {
 	int host_busnr;
 	int host_devfn;
 	int host_irq;
+	bool host_irq_disabled;
 	int guest_irq;
 	struct msi_msg guest_msi;
 #define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eb70ca6c714..fc6127cbea1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -170,6 +170,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 				KVM_ASSIGNED_DEV_GUEST_MSI) {
 		assigned_device_msi_dispatch(assigned_dev);
 		enable_irq(assigned_dev->host_irq);
+		assigned_dev->host_irq_disabled = false;
 	}
 	mutex_unlock(&assigned_dev->kvm->lock);
 	kvm_put_kvm(assigned_dev->kvm);
@@ -181,8 +182,12 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
 	kvm_get_kvm(assigned_dev->kvm);
+
 	schedule_work(&assigned_dev->interrupt_work);
+
 	disable_irq_nosync(irq);
+	assigned_dev->host_irq_disabled = true;
+
 	return IRQ_HANDLED;
 }
 
@@ -196,8 +201,16 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
 	dev = container_of(kian, struct kvm_assigned_dev_kernel,
 			   ack_notifier);
+
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
-	enable_irq(dev->host_irq);
+
+	/* The guest irq may be shared so this ack may be
+	 * from another device.
+	 */
+	if (dev->host_irq_disabled) {
+		enable_irq(dev->host_irq);
+		dev->host_irq_disabled = false;
+	}
 }
 
 static void kvm_free_assigned_irq(struct kvm *kvm,
-- 
cgit v1.2.3-70-g09d2


From 4531220b71f0399e71cda0c4cf749e7281a7416a Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 11 Dec 2008 16:54:54 +0100
Subject: KVM: x86: Rework user space NMI injection as KVM_CAP_USER_NMI

There is no point in doing the ready_for_nmi_injection/
request_nmi_window dance with user space. First, we don't do this for
in-kernel irqchip anyway, while the code path is the same as for user
space irqchip mode. And second, there is nothing to loose if a pending
NMI is overwritten by another one (in contrast to IRQs where we have to
save the number). Actually, there is even the risk of raising spurious
NMIs this way because the reason for the held-back NMI might already be
handled while processing the first one.

Therefore this patch creates a simplified user space NMI injection
interface, exporting it under KVM_CAP_USER_NMI and dropping the old
KVM_CAP_NMI capability. And this time we also take care to provide the
interface only on archs supporting NMIs via KVM (right now only x86).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c  | 24 ++----------------------
 arch/x86/kvm/x86.c  | 28 ++--------------------------
 include/linux/kvm.h | 11 +++++------
 3 files changed, 9 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 487e1dcdce3..6259d746764 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2498,15 +2498,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	}
 	if (vcpu->arch.nmi_injected) {
 		vmx_inject_nmi(vcpu);
-		if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window)
+		if (vcpu->arch.nmi_pending)
 			enable_nmi_window(vcpu);
 		else if (vcpu->arch.irq_summary
 			 || kvm_run->request_interrupt_window)
 			enable_irq_window(vcpu);
 		return;
 	}
-	if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
-		enable_nmi_window(vcpu);
 
 	if (vcpu->arch.interrupt_window_open) {
 		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -3040,14 +3038,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 	++vcpu->stat.nmi_window_exits;
 
-	/*
-	 * If the user space waits to inject a NMI, exit as soon as possible
-	 */
-	if (kvm_run->request_nmi_window && !vcpu->arch.nmi_pending) {
-		kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
-		return 0;
-	}
-
 	return 1;
 }
 
@@ -3162,7 +3152,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			vmx->soft_vnmi_blocked = 0;
 			vcpu->arch.nmi_window_open = 1;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
-		    (kvm_run->request_nmi_window || vcpu->arch.nmi_pending)) {
+			   vcpu->arch.nmi_pending) {
 			/*
 			 * This CPU don't support us in finding the end of an
 			 * NMI-blocked window if the guest runs with IRQs
@@ -3175,16 +3165,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			vmx->soft_vnmi_blocked = 0;
 			vmx->vcpu.arch.nmi_window_open = 1;
 		}
-
-		/*
-		 * If the user space waits to inject an NNI, exit ASAP
-		 */
-		if (vcpu->arch.nmi_window_open && kvm_run->request_nmi_window
-		    && !vcpu->arch.nmi_pending) {
-			kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
-			++vcpu->stat.nmi_window_exits;
-			return 0;
-		}
 	}
 
 	if (exit_reason < kvm_vmx_max_exit_handlers
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 10302d3bd41..0e6aa8141dc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2887,37 +2887,18 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
 }
 
-/*
- * Check if userspace requested a NMI window, and that the NMI window
- * is open.
- *
- * No need to exit to userspace if we already have a NMI queued.
- */
-static int dm_request_for_nmi_injection(struct kvm_vcpu *vcpu,
-					struct kvm_run *kvm_run)
-{
-	return (!vcpu->arch.nmi_pending &&
-		kvm_run->request_nmi_window &&
-		vcpu->arch.nmi_window_open);
-}
-
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
 	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_run->ready_for_interrupt_injection = 1;
-		kvm_run->ready_for_nmi_injection = 1;
-	} else {
+	else
 		kvm_run->ready_for_interrupt_injection =
 					(vcpu->arch.interrupt_window_open &&
 					 vcpu->arch.irq_summary == 0);
-		kvm_run->ready_for_nmi_injection =
-					(vcpu->arch.nmi_window_open &&
-					 vcpu->arch.nmi_pending == 0);
-	}
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3093,11 +3074,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 
 		if (r > 0) {
-			if (dm_request_for_nmi_injection(vcpu, kvm_run)) {
-				r = -EINTR;
-				kvm_run->exit_reason = KVM_EXIT_NMI;
-				++vcpu->stat.request_nmi_exits;
-			}
 			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 				r = -EINTR;
 				kvm_run->exit_reason = KVM_EXIT_INTR;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 48807767e72..35525ac6333 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -84,21 +84,18 @@ struct kvm_irqchip {
 #define KVM_EXIT_S390_RESET       14
 #define KVM_EXIT_DCR              15
 #define KVM_EXIT_NMI              16
-#define KVM_EXIT_NMI_WINDOW_OPEN  17
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
 	/* in */
 	__u8 request_interrupt_window;
-	__u8 request_nmi_window;
-	__u8 padding1[6];
+	__u8 padding1[7];
 
 	/* out */
 	__u32 exit_reason;
 	__u8 ready_for_interrupt_injection;
 	__u8 if_flag;
-	__u8 ready_for_nmi_injection;
-	__u8 padding2;
+	__u8 padding2[2];
 
 	/* in (pre_kvm_run), out (post_kvm_run) */
 	__u64 cr8;
@@ -391,12 +388,14 @@ struct kvm_trace_rec {
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
-#define KVM_CAP_NMI 19
 #if defined(CONFIG_X86)
 #define KVM_CAP_DEVICE_MSI 20
 #endif
 /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#if defined(CONFIG_X86)
+#define KVM_CAP_USER_NMI 22
+#endif
 
 /*
  * ioctls for VM fds
-- 
cgit v1.2.3-70-g09d2


From b30f8af3358b5c66be223e3a9f3d11b3d02b4a8f Mon Sep 17 00:00:00 2001
From: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Date: Mon, 17 Nov 2008 14:35:21 +0200
Subject: mmc: Add 8-bit bus width support

Signed-off-by: Jarkko Lavinen <jarkko.lavinen@nokia.com>
Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
---
 drivers/mmc/core/mmc.c   | 18 ++++++++++++++----
 include/linux/mmc/host.h |  2 ++
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index fdd7c760be8..c232d11a7ed 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -434,13 +434,24 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 	 * Activate wide bus (if supported).
 	 */
 	if ((card->csd.mmca_vsn >= CSD_SPEC_VER_4) &&
-		(host->caps & MMC_CAP_4_BIT_DATA)) {
+	    (host->caps & (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA))) {
+		unsigned ext_csd_bit, bus_width;
+
+		if (host->caps & MMC_CAP_8_BIT_DATA) {
+			ext_csd_bit = EXT_CSD_BUS_WIDTH_8;
+			bus_width = MMC_BUS_WIDTH_8;
+		} else {
+			ext_csd_bit = EXT_CSD_BUS_WIDTH_4;
+			bus_width = MMC_BUS_WIDTH_4;
+		}
+
 		err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-			EXT_CSD_BUS_WIDTH, EXT_CSD_BUS_WIDTH_4);
+				 EXT_CSD_BUS_WIDTH, ext_csd_bit);
+
 		if (err)
 			goto free_card;
 
-		mmc_set_bus_width(card->host, MMC_BUS_WIDTH_4);
+		mmc_set_bus_width(card->host, bus_width);
 	}
 
 	if (!oldcard)
@@ -624,4 +635,3 @@ err:
 
 	return err;
 }
-
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index f842f234e44..4e457256bd3 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -41,6 +41,7 @@ struct mmc_ios {
 
 #define MMC_BUS_WIDTH_1		0
 #define MMC_BUS_WIDTH_4		2
+#define MMC_BUS_WIDTH_8		3
 
 	unsigned char	timing;			/* timing specification used */
 
@@ -116,6 +117,7 @@ struct mmc_host {
 #define MMC_CAP_SDIO_IRQ	(1 << 3)	/* Can signal pending SDIO IRQs */
 #define MMC_CAP_SPI		(1 << 4)	/* Talks only SPI protocols */
 #define MMC_CAP_NEEDS_POLL	(1 << 5)	/* Needs polling for card-detection */
+#define MMC_CAP_8_BIT_DATA	(1 << 6)	/* Can the host do 8 bit transfers */
 
 	/* host specific block data */
 	unsigned int		max_seg_size;	/* see blk_queue_max_segment_size */
-- 
cgit v1.2.3-70-g09d2


From 86e8286a0e48663e1e86a5884b30a6d05de2993a Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Wed, 26 Nov 2008 22:54:17 +0300
Subject: mmc: Add mmc_vddrange_to_ocrmask() helper function

This function sets the OCR mask bits according to provided voltage
ranges. Will be used by the mmc_spi OpenFirmware bindings.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
---
 drivers/mmc/core/core.c  | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/core.h |  2 ++
 2 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c
index f7284b905eb..5f288aeeb72 100644
--- a/drivers/mmc/core/core.c
+++ b/drivers/mmc/core/core.c
@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/leds.h>
 #include <linux/scatterlist.h>
+#include <linux/log2.h>
 
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
@@ -448,6 +449,80 @@ void mmc_set_bus_width(struct mmc_host *host, unsigned int width)
 	mmc_set_ios(host);
 }
 
+/**
+ * mmc_vdd_to_ocrbitnum - Convert a voltage to the OCR bit number
+ * @vdd:	voltage (mV)
+ * @low_bits:	prefer low bits in boundary cases
+ *
+ * This function returns the OCR bit number according to the provided @vdd
+ * value. If conversion is not possible a negative errno value returned.
+ *
+ * Depending on the @low_bits flag the function prefers low or high OCR bits
+ * on boundary voltages. For example,
+ * with @low_bits = true, 3300 mV translates to ilog2(MMC_VDD_32_33);
+ * with @low_bits = false, 3300 mV translates to ilog2(MMC_VDD_33_34);
+ *
+ * Any value in the [1951:1999] range translates to the ilog2(MMC_VDD_20_21).
+ */
+static int mmc_vdd_to_ocrbitnum(int vdd, bool low_bits)
+{
+	const int max_bit = ilog2(MMC_VDD_35_36);
+	int bit;
+
+	if (vdd < 1650 || vdd > 3600)
+		return -EINVAL;
+
+	if (vdd >= 1650 && vdd <= 1950)
+		return ilog2(MMC_VDD_165_195);
+
+	if (low_bits)
+		vdd -= 1;
+
+	/* Base 2000 mV, step 100 mV, bit's base 8. */
+	bit = (vdd - 2000) / 100 + 8;
+	if (bit > max_bit)
+		return max_bit;
+	return bit;
+}
+
+/**
+ * mmc_vddrange_to_ocrmask - Convert a voltage range to the OCR mask
+ * @vdd_min:	minimum voltage value (mV)
+ * @vdd_max:	maximum voltage value (mV)
+ *
+ * This function returns the OCR mask bits according to the provided @vdd_min
+ * and @vdd_max values. If conversion is not possible the function returns 0.
+ *
+ * Notes wrt boundary cases:
+ * This function sets the OCR bits for all boundary voltages, for example
+ * [3300:3400] range is translated to MMC_VDD_32_33 | MMC_VDD_33_34 |
+ * MMC_VDD_34_35 mask.
+ */
+u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max)
+{
+	u32 mask = 0;
+
+	if (vdd_max < vdd_min)
+		return 0;
+
+	/* Prefer high bits for the boundary vdd_max values. */
+	vdd_max = mmc_vdd_to_ocrbitnum(vdd_max, false);
+	if (vdd_max < 0)
+		return 0;
+
+	/* Prefer low bits for the boundary vdd_min values. */
+	vdd_min = mmc_vdd_to_ocrbitnum(vdd_min, true);
+	if (vdd_min < 0)
+		return 0;
+
+	/* Fill the mask, from max bit to min bit. */
+	while (vdd_max >= vdd_min)
+		mask |= 1 << vdd_max--;
+
+	return mask;
+}
+EXPORT_SYMBOL(mmc_vddrange_to_ocrmask);
+
 /*
  * Mask off any voltages we don't support and select
  * the lowest voltage
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 143cebf0586..7ac8b500d55 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -151,4 +151,6 @@ static inline void mmc_claim_host(struct mmc_host *host)
 	__mmc_claim_host(host, NULL);
 }
 
+extern u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max);
+
 #endif
-- 
cgit v1.2.3-70-g09d2


From 9c43df57910bbba540a6cb5c9132302a9ea5f41a Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 30 Dec 2008 18:15:28 +0300
Subject: mmc_spi: Add support for OpenFirmware bindings

The support is implemented via platform data accessors, new module
(of_mmc_spi) will be created automatically when the driver compiles
on OpenFirmware platforms. Link-time dependency will load the module
automatically.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
---
 drivers/mmc/host/Makefile     |   3 +
 drivers/mmc/host/mmc_spi.c    |   4 +-
 drivers/mmc/host/of_mmc_spi.c | 149 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/mmc_spi.h   |  15 ++++-
 4 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 drivers/mmc/host/of_mmc_spi.c

(limited to 'include/linux')

diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile
index c794cc5ce44..f4853288bbb 100644
--- a/drivers/mmc/host/Makefile
+++ b/drivers/mmc/host/Makefile
@@ -19,6 +19,9 @@ obj-$(CONFIG_MMC_AT91)		+= at91_mci.o
 obj-$(CONFIG_MMC_ATMELMCI)	+= atmel-mci.o
 obj-$(CONFIG_MMC_TIFM_SD)	+= tifm_sd.o
 obj-$(CONFIG_MMC_SPI)		+= mmc_spi.o
+ifeq ($(CONFIG_OF),y)
+obj-$(CONFIG_MMC_SPI)		+= of_mmc_spi.o
+endif
 obj-$(CONFIG_MMC_S3C)   	+= s3cmci.o
 obj-$(CONFIG_MMC_SDRICOH_CS)	+= sdricoh_cs.o
 obj-$(CONFIG_MMC_TMIO)		+= tmio_mmc.o
diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index ad00e163231..87e211df68a 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -1285,7 +1285,7 @@ static int mmc_spi_probe(struct spi_device *spi)
 	/* Platform data is used to hook up things like card sensing
 	 * and power switching gpios.
 	 */
-	host->pdata = spi->dev.platform_data;
+	host->pdata = mmc_spi_get_pdata(spi);
 	if (host->pdata)
 		mmc->ocr_avail = host->pdata->ocr_mask;
 	if (!mmc->ocr_avail) {
@@ -1368,6 +1368,7 @@ fail_glue_init:
 
 fail_nobuf1:
 	mmc_free_host(mmc);
+	mmc_spi_put_pdata(spi);
 	dev_set_drvdata(&spi->dev, NULL);
 
 nomem:
@@ -1402,6 +1403,7 @@ static int __devexit mmc_spi_remove(struct spi_device *spi)
 
 		spi->max_speed_hz = mmc->f_max;
 		mmc_free_host(mmc);
+		mmc_spi_put_pdata(spi);
 		dev_set_drvdata(&spi->dev, NULL);
 	}
 	return 0;
diff --git a/drivers/mmc/host/of_mmc_spi.c b/drivers/mmc/host/of_mmc_spi.c
new file mode 100644
index 00000000000..fb2921f8099
--- /dev/null
+++ b/drivers/mmc/host/of_mmc_spi.c
@@ -0,0 +1,149 @@
+/*
+ * OpenFirmware bindings for the MMC-over-SPI driver
+ *
+ * Copyright (c) MontaVista Software, Inc. 2008.
+ *
+ * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/gpio.h>
+#include <linux/of.h>
+#include <linux/of_gpio.h>
+#include <linux/spi/spi.h>
+#include <linux/spi/mmc_spi.h>
+#include <linux/mmc/core.h>
+#include <linux/mmc/host.h>
+
+enum {
+	CD_GPIO = 0,
+	WP_GPIO,
+	NUM_GPIOS,
+};
+
+struct of_mmc_spi {
+	int gpios[NUM_GPIOS];
+	bool alow_gpios[NUM_GPIOS];
+	struct mmc_spi_platform_data pdata;
+};
+
+static struct of_mmc_spi *to_of_mmc_spi(struct device *dev)
+{
+	return container_of(dev->platform_data, struct of_mmc_spi, pdata);
+}
+
+static int of_mmc_spi_read_gpio(struct device *dev, int gpio_num)
+{
+	struct of_mmc_spi *oms = to_of_mmc_spi(dev);
+	bool active_low = oms->alow_gpios[gpio_num];
+	bool value = gpio_get_value(oms->gpios[gpio_num]);
+
+	return active_low ^ value;
+}
+
+static int of_mmc_spi_get_cd(struct device *dev)
+{
+	return of_mmc_spi_read_gpio(dev, CD_GPIO);
+}
+
+static int of_mmc_spi_get_ro(struct device *dev)
+{
+	return of_mmc_spi_read_gpio(dev, WP_GPIO);
+}
+
+struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi)
+{
+	struct device *dev = &spi->dev;
+	struct device_node *np = dev_archdata_get_node(&dev->archdata);
+	struct of_mmc_spi *oms;
+	const u32 *voltage_ranges;
+	int num_ranges;
+	int i;
+	int ret = -EINVAL;
+
+	if (dev->platform_data || !np)
+		return dev->platform_data;
+
+	oms = kzalloc(sizeof(*oms), GFP_KERNEL);
+	if (!oms)
+		return NULL;
+
+	voltage_ranges = of_get_property(np, "voltage-ranges", &num_ranges);
+	num_ranges = num_ranges / sizeof(*voltage_ranges) / 2;
+	if (!voltage_ranges || !num_ranges) {
+		dev_err(dev, "OF: voltage-ranges unspecified\n");
+		goto err_ocr;
+	}
+
+	for (i = 0; i < num_ranges; i++) {
+		const int j = i * 2;
+		u32 mask;
+
+		mask = mmc_vddrange_to_ocrmask(voltage_ranges[j],
+					       voltage_ranges[j + 1]);
+		if (!mask) {
+			ret = -EINVAL;
+			dev_err(dev, "OF: voltage-range #%d is invalid\n", i);
+			goto err_ocr;
+		}
+		oms->pdata.ocr_mask |= mask;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(oms->gpios); i++) {
+		enum of_gpio_flags gpio_flags;
+
+		oms->gpios[i] = of_get_gpio_flags(np, i, &gpio_flags);
+		if (!gpio_is_valid(oms->gpios[i]))
+			continue;
+
+		ret = gpio_request(oms->gpios[i], dev->bus_id);
+		if (ret < 0) {
+			oms->gpios[i] = -EINVAL;
+			continue;
+		}
+
+		if (gpio_flags & OF_GPIO_ACTIVE_LOW)
+			oms->alow_gpios[i] = true;
+	}
+
+	if (gpio_is_valid(oms->gpios[CD_GPIO]))
+		oms->pdata.get_cd = of_mmc_spi_get_cd;
+	if (gpio_is_valid(oms->gpios[WP_GPIO]))
+		oms->pdata.get_ro = of_mmc_spi_get_ro;
+
+	/* We don't support interrupts yet, let's poll. */
+	oms->pdata.caps |= MMC_CAP_NEEDS_POLL;
+
+	dev->platform_data = &oms->pdata;
+	return dev->platform_data;
+err_ocr:
+	kfree(oms);
+	return NULL;
+}
+EXPORT_SYMBOL(mmc_spi_get_pdata);
+
+void mmc_spi_put_pdata(struct spi_device *spi)
+{
+	struct device *dev = &spi->dev;
+	struct device_node *np = dev_archdata_get_node(&dev->archdata);
+	struct of_mmc_spi *oms = to_of_mmc_spi(dev);
+	int i;
+
+	if (!dev->platform_data || !np)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(oms->gpios); i++) {
+		if (gpio_is_valid(oms->gpios[i]))
+			gpio_free(oms->gpios[i]);
+	}
+	kfree(oms);
+	dev->platform_data = NULL;
+}
+EXPORT_SYMBOL(mmc_spi_put_pdata);
diff --git a/include/linux/spi/mmc_spi.h b/include/linux/spi/mmc_spi.h
index a3626aedaec..0f4eb165f25 100644
--- a/include/linux/spi/mmc_spi.h
+++ b/include/linux/spi/mmc_spi.h
@@ -1,9 +1,10 @@
 #ifndef __LINUX_SPI_MMC_SPI_H
 #define __LINUX_SPI_MMC_SPI_H
 
+#include <linux/device.h>
+#include <linux/spi/spi.h>
 #include <linux/interrupt.h>
 
-struct device;
 struct mmc_host;
 
 /* Put this in platform_data of a device being used to manage an MMC/SD
@@ -41,4 +42,16 @@ struct mmc_spi_platform_data {
 	void (*setpower)(struct device *, unsigned int maskval);
 };
 
+#ifdef CONFIG_OF
+extern struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi);
+extern void mmc_spi_put_pdata(struct spi_device *spi);
+#else
+static inline struct mmc_spi_platform_data *
+mmc_spi_get_pdata(struct spi_device *spi)
+{
+	return spi->dev.platform_data;
+}
+static inline void mmc_spi_put_pdata(struct spi_device *spi) {}
+#endif /* CONFIG_OF */
+
 #endif /* __LINUX_SPI_MMC_SPI_H */
-- 
cgit v1.2.3-70-g09d2


From be6d3e56a6b9b3a4ee44a0685e39e595073c6f0d Mon Sep 17 00:00:00 2001
From: Kentaro Takeda <takedakn@nttdata.co.jp>
Date: Wed, 17 Dec 2008 13:24:15 +0900
Subject: introduce new LSM hooks where vfsmount is available.

Add new LSM hooks for path-based checks.  Call them on directory-modifying
operations at the points where we still know the vfsmount involved.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               |  36 +++++++++++++
 fs/open.c                |   5 ++
 include/linux/security.h | 137 +++++++++++++++++++++++++++++++++++++++++++++++
 net/unix/af_unix.c       |   4 ++
 security/Kconfig         |   9 ++++
 security/capability.c    |  57 ++++++++++++++++++++
 security/security.c      |  66 +++++++++++++++++++++++
 7 files changed, 314 insertions(+)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index af3783fff1d..ab441af4196 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1556,6 +1556,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		 * Refuse to truncate files with mandatory locks held on them.
 		 */
 		error = locks_verify_locked(inode);
+		if (!error)
+			error = security_path_truncate(&nd->path, 0,
+					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
 			DQUOT_INIT(inode);
 
@@ -1586,7 +1589,11 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
 
 	if (!IS_POSIXACL(dir->d_inode))
 		mode &= ~current->fs->umask;
+	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
+	if (error)
+		goto out_unlock;
 	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
+out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
@@ -1999,6 +2006,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_mknod(&nd.path, dentry, mode, dev);
+	if (error)
+		goto out_drop_write;
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2011,6 +2021,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
 			break;
 	}
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2070,7 +2081,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_mkdir(&nd.path, dentry, mode);
+	if (error)
+		goto out_drop_write;
 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2180,7 +2195,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit3;
+	error = security_path_rmdir(&nd.path, dentry);
+	if (error)
+		goto exit4;
 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+exit4:
 	mnt_drop_write(nd.path.mnt);
 exit3:
 	dput(dentry);
@@ -2265,7 +2284,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit2;
+		error = security_path_unlink(&nd.path, dentry);
+		if (error)
+			goto exit3;
 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+exit3:
 		mnt_drop_write(nd.path.mnt);
 	exit2:
 		dput(dentry);
@@ -2346,7 +2369,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_symlink(&nd.path, dentry, from);
+	if (error)
+		goto out_drop_write;
 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2443,7 +2470,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+	if (error)
+		goto out_drop_write;
 	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(new_dentry);
@@ -2679,8 +2710,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(oldnd.path.mnt);
 	if (error)
 		goto exit5;
+	error = security_path_rename(&oldnd.path, old_dentry,
+				     &newnd.path, new_dentry);
+	if (error)
+		goto exit6;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 				   new_dir->d_inode, new_dentry);
+exit6:
 	mnt_drop_write(oldnd.path.mnt);
 exit5:
 	dput(new_dentry);
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766..1cd7d40e999 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 		goto put_write_and_out;
 
 	error = locks_verify_truncate(inode, NULL, length);
+	if (!error)
+		error = security_path_truncate(&path, length, 0);
 	if (!error) {
 		DQUOT_INIT(inode);
 		error = do_truncate(path.dentry, length, 0, NULL);
@@ -328,6 +330,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 		goto out_putf;
 
 	error = locks_verify_truncate(inode, file, length);
+	if (!error)
+		error = security_path_truncate(&file->f_path, length,
+					       ATTR_MTIME|ATTR_CTIME);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
diff --git a/include/linux/security.h b/include/linux/security.h
index 3416cb85e77..b92b5e453f6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -335,17 +335,37 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@dir contains the inode structure of the parent directory of the new link.
  *	@new_dentry contains the dentry structure for the new link.
  *	Return 0 if permission is granted.
+ * @path_link:
+ *	Check permission before creating a new hard link to a file.
+ *	@old_dentry contains the dentry structure for an existing link
+ *	to the file.
+ *	@new_dir contains the path structure of the parent directory of
+ *	the new link.
+ *	@new_dentry contains the dentry structure for the new link.
+ *	Return 0 if permission is granted.
  * @inode_unlink:
  *	Check the permission to remove a hard link to a file.
  *	@dir contains the inode structure of parent directory of the file.
  *	@dentry contains the dentry structure for file to be unlinked.
  *	Return 0 if permission is granted.
+ * @path_unlink:
+ *	Check the permission to remove a hard link to a file.
+ *	@dir contains the path structure of parent directory of the file.
+ *	@dentry contains the dentry structure for file to be unlinked.
+ *	Return 0 if permission is granted.
  * @inode_symlink:
  *	Check the permission to create a symbolic link to a file.
  *	@dir contains the inode structure of parent directory of the symbolic link.
  *	@dentry contains the dentry structure of the symbolic link.
  *	@old_name contains the pathname of file.
  *	Return 0 if permission is granted.
+ * @path_symlink:
+ *	Check the permission to create a symbolic link to a file.
+ *	@dir contains the path structure of parent directory of
+ *	the symbolic link.
+ *	@dentry contains the dentry structure of the symbolic link.
+ *	@old_name contains the pathname of file.
+ *	Return 0 if permission is granted.
  * @inode_mkdir:
  *	Check permissions to create a new directory in the existing directory
  *	associated with inode strcture @dir.
@@ -353,11 +373,25 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@dentry contains the dentry structure of new directory.
  *	@mode contains the mode of new directory.
  *	Return 0 if permission is granted.
+ * @path_mkdir:
+ *	Check permissions to create a new directory in the existing directory
+ *	associated with path strcture @path.
+ *	@dir containst the path structure of parent of the directory
+ *	to be created.
+ *	@dentry contains the dentry structure of new directory.
+ *	@mode contains the mode of new directory.
+ *	Return 0 if permission is granted.
  * @inode_rmdir:
  *	Check the permission to remove a directory.
  *	@dir contains the inode structure of parent of the directory to be removed.
  *	@dentry contains the dentry structure of directory to be removed.
  *	Return 0 if permission is granted.
+ * @path_rmdir:
+ *	Check the permission to remove a directory.
+ *	@dir contains the path structure of parent of the directory to be
+ *	removed.
+ *	@dentry contains the dentry structure of directory to be removed.
+ *	Return 0 if permission is granted.
  * @inode_mknod:
  *	Check permissions when creating a special file (or a socket or a fifo
  *	file created via the mknod system call).  Note that if mknod operation
@@ -368,6 +402,15 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@mode contains the mode of the new file.
  *	@dev contains the device number.
  *	Return 0 if permission is granted.
+ * @path_mknod:
+ *	Check permissions when creating a file. Note that this hook is called
+ *	even if mknod operation is being done for a regular file.
+ *	@dir contains the path structure of parent of the new file.
+ *	@dentry contains the dentry structure of the new file.
+ *	@mode contains the mode of the new file.
+ *	@dev contains the undecoded device number. Use new_decode_dev() to get
+ *	the decoded device number.
+ *	Return 0 if permission is granted.
  * @inode_rename:
  *	Check for permission to rename a file or directory.
  *	@old_dir contains the inode structure for parent of the old link.
@@ -375,6 +418,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@new_dir contains the inode structure for parent of the new link.
  *	@new_dentry contains the dentry structure of the new link.
  *	Return 0 if permission is granted.
+ * @path_rename:
+ *	Check for permission to rename a file or directory.
+ *	@old_dir contains the path structure for parent of the old link.
+ *	@old_dentry contains the dentry structure of the old link.
+ *	@new_dir contains the path structure for parent of the new link.
+ *	@new_dentry contains the dentry structure of the new link.
+ *	Return 0 if permission is granted.
  * @inode_readlink:
  *	Check the permission to read the symbolic link.
  *	@dentry contains the dentry structure for the file link.
@@ -403,6 +453,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@dentry contains the dentry structure for the file.
  *	@attr is the iattr structure containing the new file attributes.
  *	Return 0 if permission is granted.
+ * @path_truncate:
+ *	Check permission before truncating a file.
+ *	@path contains the path structure for the file.
+ *	@length is the new length of the file.
+ *	@time_attrs is the flags passed to do_truncate().
+ *	Return 0 if permission is granted.
  * @inode_getattr:
  *	Check permission before obtaining file attributes.
  *	@mnt is the vfsmount where the dentry was looked up
@@ -1331,6 +1387,22 @@ struct security_operations {
 				   struct super_block *newsb);
 	int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts);
 
+#ifdef CONFIG_SECURITY_PATH
+	int (*path_unlink) (struct path *dir, struct dentry *dentry);
+	int (*path_mkdir) (struct path *dir, struct dentry *dentry, int mode);
+	int (*path_rmdir) (struct path *dir, struct dentry *dentry);
+	int (*path_mknod) (struct path *dir, struct dentry *dentry, int mode,
+			   unsigned int dev);
+	int (*path_truncate) (struct path *path, loff_t length,
+			      unsigned int time_attrs);
+	int (*path_symlink) (struct path *dir, struct dentry *dentry,
+			     const char *old_name);
+	int (*path_link) (struct dentry *old_dentry, struct path *new_dir,
+			  struct dentry *new_dentry);
+	int (*path_rename) (struct path *old_dir, struct dentry *old_dentry,
+			    struct path *new_dir, struct dentry *new_dentry);
+#endif
+
 	int (*inode_alloc_security) (struct inode *inode);
 	void (*inode_free_security) (struct inode *inode);
 	int (*inode_init_security) (struct inode *inode, struct inode *dir,
@@ -2705,6 +2777,71 @@ static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi
 
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
+#ifdef CONFIG_SECURITY_PATH
+int security_path_unlink(struct path *dir, struct dentry *dentry);
+int security_path_mkdir(struct path *dir, struct dentry *dentry, int mode);
+int security_path_rmdir(struct path *dir, struct dentry *dentry);
+int security_path_mknod(struct path *dir, struct dentry *dentry, int mode,
+			unsigned int dev);
+int security_path_truncate(struct path *path, loff_t length,
+			   unsigned int time_attrs);
+int security_path_symlink(struct path *dir, struct dentry *dentry,
+			  const char *old_name);
+int security_path_link(struct dentry *old_dentry, struct path *new_dir,
+		       struct dentry *new_dentry);
+int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
+			 struct path *new_dir, struct dentry *new_dentry);
+#else	/* CONFIG_SECURITY_PATH */
+static inline int security_path_unlink(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static inline int security_path_mkdir(struct path *dir, struct dentry *dentry,
+				      int mode)
+{
+	return 0;
+}
+
+static inline int security_path_rmdir(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static inline int security_path_mknod(struct path *dir, struct dentry *dentry,
+				      int mode, unsigned int dev)
+{
+	return 0;
+}
+
+static inline int security_path_truncate(struct path *path, loff_t length,
+					 unsigned int time_attrs)
+{
+	return 0;
+}
+
+static inline int security_path_symlink(struct path *dir, struct dentry *dentry,
+					const char *old_name)
+{
+	return 0;
+}
+
+static inline int security_path_link(struct dentry *old_dentry,
+				     struct path *new_dir,
+				     struct dentry *new_dentry)
+{
+	return 0;
+}
+
+static inline int security_path_rename(struct path *old_dir,
+				       struct dentry *old_dentry,
+				       struct path *new_dir,
+				       struct dentry *new_dentry)
+{
+	return 0;
+}
+#endif	/* CONFIG_SECURITY_PATH */
+
 #ifdef CONFIG_KEYS
 #ifdef CONFIG_SECURITY
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c6250d0055d..d1b89820ab4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -836,7 +836,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		err = mnt_want_write(nd.path.mnt);
 		if (err)
 			goto out_mknod_dput;
+		err = security_path_mknod(&nd.path, dentry, mode, 0);
+		if (err)
+			goto out_mknod_drop_write;
 		err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0);
+out_mknod_drop_write:
 		mnt_drop_write(nd.path.mnt);
 		if (err)
 			goto out_mknod_dput;
diff --git a/security/Kconfig b/security/Kconfig
index d9f47ce7e20..9438535d7fd 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -81,6 +81,15 @@ config SECURITY_NETWORK_XFRM
 	  IPSec.
 	  If you are unsure how to answer this question, answer N.
 
+config SECURITY_PATH
+	bool "Security hooks for pathname based access control"
+	depends on SECURITY
+	help
+	  This enables the security hooks for pathname based access control.
+	  If enabled, a security module can use these hooks to
+	  implement pathname based access controls.
+	  If you are unsure how to answer this question, answer N.
+
 config SECURITY_FILE_CAPABILITIES
 	bool "File POSIX Capabilities"
 	default n
diff --git a/security/capability.c b/security/capability.c
index 2dce66fcb99..c545bd1300b 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -263,6 +263,53 @@ static void cap_inode_getsecid(const struct inode *inode, u32 *secid)
 	*secid = 0;
 }
 
+#ifdef CONFIG_SECURITY_PATH
+static int cap_path_mknod(struct path *dir, struct dentry *dentry, int mode,
+			  unsigned int dev)
+{
+	return 0;
+}
+
+static int cap_path_mkdir(struct path *dir, struct dentry *dentry, int mode)
+{
+	return 0;
+}
+
+static int cap_path_rmdir(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static int cap_path_unlink(struct path *dir, struct dentry *dentry)
+{
+	return 0;
+}
+
+static int cap_path_symlink(struct path *dir, struct dentry *dentry,
+			    const char *old_name)
+{
+	return 0;
+}
+
+static int cap_path_link(struct dentry *old_dentry, struct path *new_dir,
+			 struct dentry *new_dentry)
+{
+	return 0;
+}
+
+static int cap_path_rename(struct path *old_path, struct dentry *old_dentry,
+			   struct path *new_path, struct dentry *new_dentry)
+{
+	return 0;
+}
+
+static int cap_path_truncate(struct path *path, loff_t length,
+			     unsigned int time_attrs)
+{
+	return 0;
+}
+#endif
+
 static int cap_file_permission(struct file *file, int mask)
 {
 	return 0;
@@ -883,6 +930,16 @@ void security_fixup_ops(struct security_operations *ops)
 	set_to_cap_if_null(ops, inode_setsecurity);
 	set_to_cap_if_null(ops, inode_listsecurity);
 	set_to_cap_if_null(ops, inode_getsecid);
+#ifdef CONFIG_SECURITY_PATH
+	set_to_cap_if_null(ops, path_mknod);
+	set_to_cap_if_null(ops, path_mkdir);
+	set_to_cap_if_null(ops, path_rmdir);
+	set_to_cap_if_null(ops, path_unlink);
+	set_to_cap_if_null(ops, path_symlink);
+	set_to_cap_if_null(ops, path_link);
+	set_to_cap_if_null(ops, path_rename);
+	set_to_cap_if_null(ops, path_truncate);
+#endif
 	set_to_cap_if_null(ops, file_permission);
 	set_to_cap_if_null(ops, file_alloc_security);
 	set_to_cap_if_null(ops, file_free_security);
diff --git a/security/security.c b/security/security.c
index d85dbb37c97..678d4d07b85 100644
--- a/security/security.c
+++ b/security/security.c
@@ -355,6 +355,72 @@ int security_inode_init_security(struct inode *inode, struct inode *dir,
 }
 EXPORT_SYMBOL(security_inode_init_security);
 
+#ifdef CONFIG_SECURITY_PATH
+int security_path_mknod(struct path *path, struct dentry *dentry, int mode,
+			unsigned int dev)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_mknod(path, dentry, mode, dev);
+}
+EXPORT_SYMBOL(security_path_mknod);
+
+int security_path_mkdir(struct path *path, struct dentry *dentry, int mode)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_mkdir(path, dentry, mode);
+}
+
+int security_path_rmdir(struct path *path, struct dentry *dentry)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_rmdir(path, dentry);
+}
+
+int security_path_unlink(struct path *path, struct dentry *dentry)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_unlink(path, dentry);
+}
+
+int security_path_symlink(struct path *path, struct dentry *dentry,
+			  const char *old_name)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_symlink(path, dentry, old_name);
+}
+
+int security_path_link(struct dentry *old_dentry, struct path *new_dir,
+		       struct dentry *new_dentry)
+{
+	if (unlikely(IS_PRIVATE(old_dentry->d_inode)))
+		return 0;
+	return security_ops->path_link(old_dentry, new_dir, new_dentry);
+}
+
+int security_path_rename(struct path *old_dir, struct dentry *old_dentry,
+			 struct path *new_dir, struct dentry *new_dentry)
+{
+	if (unlikely(IS_PRIVATE(old_dentry->d_inode) ||
+		     (new_dentry->d_inode && IS_PRIVATE(new_dentry->d_inode))))
+		return 0;
+	return security_ops->path_rename(old_dir, old_dentry, new_dir,
+					 new_dentry);
+}
+
+int security_path_truncate(struct path *path, loff_t length,
+			   unsigned int time_attrs)
+{
+	if (unlikely(IS_PRIVATE(path->dentry->d_inode)))
+		return 0;
+	return security_ops->path_truncate(path, length, time_attrs);
+}
+#endif
+
 int security_inode_create(struct inode *dir, struct dentry *dentry, int mode)
 {
 	if (unlikely(IS_PRIVATE(dir)))
-- 
cgit v1.2.3-70-g09d2


From c2452f32786159ed85f0e4b21fec09258f822fc8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 1 Dec 2008 09:33:43 +0100
Subject: shrink struct dentry

struct dentry is one of the most critical structures in the kernel. So it's
sad to see it going neglected.

With CONFIG_PROFILING turned on (which is probably the common case at least
for distros and kernel developers), sizeof(struct dcache) == 208 here
(64-bit). This gives 19 objects per slab.

I packed d_mounted into a hole, and took another 4 bytes off the inline
name length to take the padding out from the end of the structure. This
shinks it to 200 bytes. I could have gone the other way and increased the
length to 40, but I'm aiming for a magic number, read on...

I then got rid of the d_cookie pointer. This shrinks it to 192 bytes. Rant:
why was this ever a good idea? The cookie system should increase its hash
size or use a tree or something if lookups are a problem. Also the "fast
dcookie lookups" in oprofile should be moved into the dcookie code -- how
can oprofile possibly care about the dcookie_mutex? It gets dropped after
get_dcookie() returns so it can't be providing any sort of protection.

At 192 bytes, 21 objects fit into a 4K page, saving about 3MB on my system
with ~140 000 entries allocated. 192 is also a multiple of 64, so we get
nice cacheline alignment on 64 and 32 byte line systems -- any given dentry
will now require 3 cachelines to touch all fields wheras previously it
would require 4.

I know the inline name size was chosen quite carefully, however with the
reduction in cacheline footprint, it should actually be just about as fast
to do a name lookup for a 36 character name as it was before the patch (and
faster for other sizes). The memory footprint savings for names which are
<= 32 or > 36 bytes long should more than make up for the memory cost for
33-36 byte names.

Performance is a feature...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/oprofile/cell/spu_task_sync.c |  2 +-
 drivers/oprofile/buffer_sync.c             |  2 +-
 fs/dcache.c                                |  4 ----
 fs/dcookies.c                              | 28 +++++++++++++++++++---------
 include/linux/dcache.h                     | 21 ++++++++++++++-------
 5 files changed, 35 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 2949126d28d..6b793aeda72 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -297,7 +297,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
 {
 	unsigned long cookie;
 
-	if (path->dentry->d_cookie)
+	if (path->dentry->d_flags & DCACHE_COOKIE)
 		return (unsigned long)path->dentry;
 	get_dcookie(path, &cookie);
 	return cookie;
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 737bd948482..65e8294a9e2 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -200,7 +200,7 @@ static inline unsigned long fast_get_dcookie(struct path *path)
 {
 	unsigned long cookie;
 
-	if (path->dentry->d_cookie)
+	if (path->dentry->d_flags & DCACHE_COOKIE)
 		return (unsigned long)path->dentry;
 	get_dcookie(path, &cookie);
 	return cookie;
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e6..fd244c7a7cc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
 #include <linux/bootmem.h>
 #include "internal.h"
 
-
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
 	dentry->d_mounted = 0;
-#ifdef CONFIG_PROFILING
-	dentry->d_cookie = NULL;
-#endif
 	INIT_HLIST_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619..180e9fec4ad 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
 {
 	struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
 							GFP_KERNEL);
+	struct dentry *d;
 	if (!dcs)
 		return NULL;
 
-	path->dentry->d_cookie = dcs;
+	d = path->dentry;
+	spin_lock(&d->d_lock);
+	d->d_flags |= DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
 	dcs->path = *path;
 	path_get(path);
 	hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
 		goto out;
 	}
 
-	dcs = path->dentry->d_cookie;
-
-	if (!dcs)
+	if (path->dentry->d_flags & DCACHE_COOKIE) {
+		dcs = find_dcookie((unsigned long)path->dentry);
+	} else {
 		dcs = alloc_dcookie(path);
-
-	if (!dcs) {
-		err = -ENOMEM;
-		goto out;
+		if (!dcs) {
+			err = -ENOMEM;
+			goto out;
+		}
 	}
 
 	*cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
 
 static void free_dcookie(struct dcookie_struct * dcs)
 {
-	dcs->path.dentry->d_cookie = NULL;
+	struct dentry *d = dcs->path.dentry;
+
+	spin_lock(&d->d_lock);
+	d->d_flags &= ~DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
 	path_put(&dcs->path);
 	kmem_cache_free(dcookie_cache, dcs);
 }
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a37359d0bad..c66d22487bf 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -75,14 +75,22 @@ full_name_hash(const unsigned char *name, unsigned int len)
 	return end_name_hash(hash);
 }
 
-struct dcookie_struct;
-
-#define DNAME_INLINE_LEN_MIN 36
+/*
+ * Try to keep struct dentry aligned on 64 byte cachelines (this will
+ * give reasonable cacheline footprint with larger lines without the
+ * large memory footprint increase).
+ */
+#ifdef CONFIG_64BIT
+#define DNAME_INLINE_LEN_MIN 32 /* 192 bytes */
+#else
+#define DNAME_INLINE_LEN_MIN 40 /* 128 bytes */
+#endif
 
 struct dentry {
 	atomic_t d_count;
 	unsigned int d_flags;		/* protected by d_lock */
 	spinlock_t d_lock;		/* per dentry lock */
+	int d_mounted;
 	struct inode *d_inode;		/* Where the name belongs to - NULL is
 					 * negative */
 	/*
@@ -107,10 +115,7 @@ struct dentry {
 	struct dentry_operations *d_op;
 	struct super_block *d_sb;	/* The root of the dentry tree */
 	void *d_fsdata;			/* fs-specific data */
-#ifdef CONFIG_PROFILING
-	struct dcookie_struct *d_cookie; /* cookie, if any */
-#endif
-	int d_mounted;
+
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
 };
 
@@ -177,6 +182,8 @@ d_iput:		no		no		no       yes
 
 #define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
 
+#define DCACHE_COOKIE		0x0040	/* For use by dcookie subsystem */
+
 extern spinlock_t dcache_lock;
 extern seqlock_t rename_lock;
 
-- 
cgit v1.2.3-70-g09d2


From dded4f4d5048e64a01cf52eed4d27c8cb2600525 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Mon, 1 Dec 2008 14:34:50 -0800
Subject: include: linux/fs.h: put declarations in __KERNEL__

include/linux/fs.h contains externs for a bunch of variables.  That obviously
belongs under ifdef __KERNEL__.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 001ded4845b..c5e4c5c7403 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -21,7 +21,6 @@
 
 /* Fixed constants first: */
 #undef NR_OPEN
-extern int sysctl_nr_open;
 #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
 
 #define BLOCK_SIZE_BITS 10
@@ -38,21 +37,13 @@ struct files_stat_struct {
 	int nr_free_files;	/* read only */
 	int max_files;		/* tunable */
 };
-extern struct files_stat_struct files_stat;
-extern int get_max_files(void);
 
 struct inodes_stat_t {
 	int nr_inodes;
 	int nr_unused;
 	int dummy[5];		/* padding for sysctl ABI compatibility */
 };
-extern struct inodes_stat_t inodes_stat;
 
-extern int leases_enable, lease_break_time;
-
-#ifdef CONFIG_DNOTIFY
-extern int dir_notify_enable;
-#endif
 
 #define NR_FILE  8192	/* this can well be larger on a larger system */
 
@@ -330,6 +321,15 @@ extern void __init inode_init(void);
 extern void __init inode_init_early(void);
 extern void __init files_init(unsigned long);
 
+extern struct files_stat_struct files_stat;
+extern int get_max_files(void);
+extern int sysctl_nr_open;
+extern struct inodes_stat_t inodes_stat;
+extern int leases_enable, lease_break_time;
+#ifdef CONFIG_DNOTIFY
+extern int dir_notify_enable;
+#endif
+
 struct buffer_head;
 typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
-- 
cgit v1.2.3-70-g09d2


From 035146851cfa2fe24c1d9dc7637bb009ad06b2f7 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:11 +0000
Subject: vfs: introduce helper function to safely NUL-terminate symlinks

A number of filesystems were potentially triggering kernel bugs due to
corrupted symlink names on disk. This function helps safely terminate
the names.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/namei.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/namei.h b/include/linux/namei.h
index 99eb80306dc..fc2e0357987 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -94,4 +94,9 @@ static inline char *nd_get_link(struct nameidata *nd)
 	return nd->saved_names[nd->depth];
 }
 
+static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
+{
+	((char *) name)[min(len, maxlen)] = '\0';
+}
+
 #endif /* _LINUX_NAMEI_H */
-- 
cgit v1.2.3-70-g09d2


From 3fb64190aa3c23c10e6e9fd0124ac030115c99bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2008 09:58:10 +0200
Subject: pass a struct path * to may_open

No need for the nameidata in may_open - a struct path is enough.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 14 +++++++-------
 fs/nfsctl.c        |  5 +++--
 include/linux/fs.h |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index d4d0b59ed2c..5cc0dc95a7a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1487,9 +1487,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return error;
 }
 
-int may_open(struct nameidata *nd, int acc_mode, int flag)
+int may_open(struct path *path, int acc_mode, int flag)
 {
-	struct dentry *dentry = nd->path.dentry;
+	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
 	int error;
 
@@ -1510,13 +1510,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 	    	flag &= ~O_TRUNC;
 	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
-		if (nd->path.mnt->mnt_flags & MNT_NODEV)
+		if (path->mnt->mnt_flags & MNT_NODEV)
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
 	}
 
-	error = vfs_permission(nd, acc_mode);
+	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
 	/*
@@ -1551,7 +1551,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		 */
 		error = locks_verify_locked(inode);
 		if (!error)
-			error = security_path_truncate(&nd->path, 0,
+			error = security_path_truncate(path, 0,
 					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
 			DQUOT_INIT(inode);
@@ -1594,7 +1594,7 @@ out_unlock:
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
-	return may_open(nd, 0, flag & ~O_TRUNC);
+	return may_open(&nd->path, 0, flag & ~O_TRUNC);
 }
 
 /*
@@ -1780,7 +1780,7 @@ ok:
 		if (error)
 			goto exit;
 	}
-	error = may_open(&nd, acc_mode, flag);
+	error = may_open(&nd.path, acc_mode, flag);
 	if (error) {
 		if (will_write)
 			mnt_drop_write(nd.path.mnt);
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6f..b27451909df 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
 		return ERR_PTR(error);
 
 	if (flags == O_RDWR)
-		error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
+		error = may_open(&nd.path, MAY_READ|MAY_WRITE,
+					   FMODE_READ|FMODE_WRITE);
 	else
-		error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
+		error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
 
 	if (!error)
 		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c5e4c5c7403..3468df5a06e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1869,7 +1869,7 @@ extern void free_write_pipe(struct file *);
 
 extern struct file *do_filp_open(int dfd, const char *pathname,
 		int open_flag, int mode);
-extern int may_open(struct nameidata *, int, int);
+extern int may_open(struct path *, int, int);
 
 extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
 extern struct file * open_exec(const char *);
-- 
cgit v1.2.3-70-g09d2


From cb23beb55100171646e69e248fb45f10db6e99a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2008 09:59:29 +0200
Subject: kill vfs_permission

With all the nameidata removal there's no point anymore for this helper.
Of the three callers left two will go away with the next lookup series
anyway.

Also add proper kerneldoc to inode_permission as this is the main
permission check routine now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c          |  5 +++--
 fs/namei.c         | 31 +++++++++++++------------------
 include/linux/fs.h |  1 -
 3 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 02d2e120542..dfbf7009fbe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -127,7 +127,8 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
+	error = inode_permission(nd.path.dentry->d_inode,
+				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
 
@@ -680,7 +681,7 @@ struct file *open_exec(const char *name)
 	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
 		goto out_path_put;
 
-	err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
 
diff --git a/fs/namei.c b/fs/namei.c
index 5cc0dc95a7a..3f88e043d45 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
 	return -EACCES;
 }
 
+/**
+ * inode_permission  -  check for access rights to a given inode
+ * @inode:	inode to check permission on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Used to check for read/write/execute permissions on an inode.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ */
 int inode_permission(struct inode *inode, int mask)
 {
 	int retval;
@@ -263,21 +273,6 @@ int inode_permission(struct inode *inode, int mask)
 			mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
 }
 
-/**
- * vfs_permission  -  check for access rights to a given path
- * @nd:		lookup result that describes the path
- * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on a path.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
- */
-int vfs_permission(struct nameidata *nd, int mask)
-{
-	return inode_permission(nd->path.dentry->d_inode, mask);
-}
-
 /**
  * file_permission  -  check for additional access rights to a given file
  * @file:	file to check access rights for
@@ -288,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
  *
  * Note:
  *	Do not use this function in new code.  All access checks should
- *	be done using vfs_permission().
+ *	be done using inode_permission().
  */
 int file_permission(struct file *file, int mask)
 {
@@ -853,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
-			err = vfs_permission(nd, MAY_EXEC);
+			err = inode_permission(nd->path.dentry->d_inode,
+					       MAY_EXEC);
  		if (err)
 			break;
 
@@ -2882,7 +2878,6 @@ EXPORT_SYMBOL(path_lookup);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(vfs_permission);
 EXPORT_SYMBOL(file_permission);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(vfs_create);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3468df5a06e..fd615986a41 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1212,7 +1212,6 @@ extern void unlock_super(struct super_block *);
 /*
  * VFS helper functions..
  */
-extern int vfs_permission(struct nameidata *, int);
 extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
 extern int vfs_mkdir(struct inode *, struct dentry *, int);
 extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
-- 
cgit v1.2.3-70-g09d2


From 18d8fda7c3c9439be04d7ea2e82da2513b121acb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Dec 2008 00:35:37 -0500
Subject: take init_fs to saner place

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/init_task.c     | 1 -
 arch/arm/kernel/init_task.c       | 1 -
 arch/avr32/kernel/init_task.c     | 1 -
 arch/blackfin/kernel/init_task.c  | 1 -
 arch/cris/kernel/process.c        | 1 -
 arch/frv/kernel/init_task.c       | 1 -
 arch/h8300/kernel/init_task.c     | 1 -
 arch/ia64/kernel/init_task.c      | 1 -
 arch/m32r/kernel/init_task.c      | 1 -
 arch/m68k/kernel/process.c        | 1 -
 arch/m68knommu/kernel/init_task.c | 1 -
 arch/mips/kernel/init_task.c      | 1 -
 arch/mn10300/kernel/init_task.c   | 1 -
 arch/parisc/kernel/init_task.c    | 1 -
 arch/powerpc/kernel/init_task.c   | 1 -
 arch/s390/kernel/init_task.c      | 1 -
 arch/sh/kernel/init_task.c        | 1 -
 arch/sparc/kernel/init_task.c     | 1 -
 arch/um/kernel/init_task.c        | 1 -
 arch/x86/kernel/init_task.c       | 1 -
 arch/xtensa/kernel/init_task.c    | 1 -
 fs/namei.c                        | 7 +++++++
 include/linux/fs_struct.h         | 6 ------
 include/linux/init_task.h         | 1 +
 24 files changed, 8 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c
index 1f762189fa6..c2938e574a4 100644
--- a/arch/alpha/kernel/init_task.c
+++ b/arch/alpha/kernel/init_task.c
@@ -8,7 +8,6 @@
 #include <asm/uaccess.h>
 
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c
index 0bbf8062539..e859af34946 100644
--- a/arch/arm/kernel/init_task.c
+++ b/arch/arm/kernel/init_task.c
@@ -12,7 +12,6 @@
 
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c
index 44058469c6e..993d56ee3cf 100644
--- a/arch/avr32/kernel/init_task.c
+++ b/arch/avr32/kernel/init_task.c
@@ -13,7 +13,6 @@
 
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/blackfin/kernel/init_task.c b/arch/blackfin/kernel/init_task.c
index 6bdba7b2110..2c228c02097 100644
--- a/arch/blackfin/kernel/init_task.c
+++ b/arch/blackfin/kernel/init_task.c
@@ -33,7 +33,6 @@
 #include <linux/mqueue.h>
 #include <linux/fs.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 
diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 5933656db5a..60816e87645 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -37,7 +37,6 @@
  * setup.
  */
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c
index e2198815b63..29429a8b7f6 100644
--- a/arch/frv/kernel/init_task.c
+++ b/arch/frv/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include <asm/pgtable.h>
 
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c
index 93a4899e46c..cb5dc552da9 100644
--- a/arch/h8300/kernel/init_task.c
+++ b/arch/h8300/kernel/init_task.c
@@ -12,7 +12,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index 9d7e1c66faf..5b0e830c6f3 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -17,7 +17,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c
index 0d658dbb676..016885c6f26 100644
--- a/arch/m32r/kernel/init_task.c
+++ b/arch/m32r/kernel/init_task.c
@@ -11,7 +11,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 3042c2bc8c5..632ce016014 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -40,7 +40,6 @@
  * alignment requirements and potentially different initial
  * setup.
  */
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c
index 344c01aede0..fe282de1d59 100644
--- a/arch/m68knommu/kernel/init_task.c
+++ b/arch/m68knommu/kernel/init_task.c
@@ -12,7 +12,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c
index d72487ad7c1..149cd914526 100644
--- a/arch/mips/kernel/init_task.c
+++ b/arch/mips/kernel/init_task.c
@@ -9,7 +9,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c
index af16f6e5c91..5ac3566f8c9 100644
--- a/arch/mn10300/kernel/init_task.c
+++ b/arch/mn10300/kernel/init_task.c
@@ -18,7 +18,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c
index f5941c08655..1e25a45d64c 100644
--- a/arch/parisc/kernel/init_task.c
+++ b/arch/parisc/kernel/init_task.c
@@ -34,7 +34,6 @@
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c
index 4c85b8d5647..688b329800b 100644
--- a/arch/powerpc/kernel/init_task.c
+++ b/arch/powerpc/kernel/init_task.c
@@ -7,7 +7,6 @@
 #include <linux/mqueue.h>
 #include <asm/uaccess.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c
index e8071684361..7db95c0b869 100644
--- a/arch/s390/kernel/init_task.c
+++ b/arch/s390/kernel/init_task.c
@@ -16,7 +16,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c
index b151a25cb14..80c35ff71d5 100644
--- a/arch/sh/kernel/init_task.c
+++ b/arch/sh/kernel/init_task.c
@@ -7,7 +7,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct pt_regs fake_swapper_regs;
diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c
index 62126e4cec5..f28cb8278e9 100644
--- a/arch/sparc/kernel/init_task.c
+++ b/arch/sparc/kernel/init_task.c
@@ -8,7 +8,6 @@
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c
index 910eda8fca1..806d381947b 100644
--- a/arch/um/kernel/init_task.c
+++ b/arch/um/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include "linux/mqueue.h"
 #include "asm/uaccess.h"
 
-static struct fs_struct init_fs = INIT_FS;
 struct mm_struct init_mm = INIT_MM(init_mm);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index d39918076bb..df3bf269bea 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,7 +10,6 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c
index 3df469dbe81..e07f5c9fcd3 100644
--- a/arch/xtensa/kernel/init_task.c
+++ b/arch/xtensa/kernel/init_task.c
@@ -21,7 +21,6 @@
 
 #include <asm/uaccess.h>
 
-static struct fs_struct init_fs = INIT_FS;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/fs/namei.c b/fs/namei.c
index 3f88e043d45..e203691b9d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2893,3 +2893,10 @@ EXPORT_SYMBOL(vfs_symlink);
 EXPORT_SYMBOL(vfs_unlink);
 EXPORT_SYMBOL(dentry_unhash);
 EXPORT_SYMBOL(generic_readlink);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+	.count		= ATOMIC_INIT(1),
+	.lock		= RW_LOCK_UNLOCKED,
+	.umask		= 0022,
+};
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 9e5a06e78d0..a97c053d3a9 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -10,12 +10,6 @@ struct fs_struct {
 	struct path root, pwd;
 };
 
-#define INIT_FS {				\
-	.count		= ATOMIC_INIT(1),	\
-	.lock		= RW_LOCK_UNLOCKED,	\
-	.umask		= 0022, \
-}
-
 extern struct kmem_cache *fs_cachep;
 
 extern void exit_fs(struct task_struct *);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 959f5522d10..2f3c2d4ef73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,7 @@
 #include <net/net_namespace.h>
 
 extern struct files_struct init_files;
+extern struct fs_struct init_fs;
 
 #define INIT_KIOCTX(name, which_mm) \
 {							\
-- 
cgit v1.2.3-70-g09d2


From b6b3fdead251d432f32f2cfce2a893ab8a658110 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 10 Dec 2008 09:35:45 -0800
Subject: filp_cachep can be static in fs/file_table.c

Instead of creating the "filp" kmem_cache in vfs_caches_init(),
we can do it a litle be later in files_init(), so that filp_cachep
is static to fs/file_table.c

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c             |  6 ------
 fs/file_table.c         | 10 +++++++++-
 include/linux/fdtable.h |  2 --
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index bdb3f50248a..e88c23b85a3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2314,9 +2314,6 @@ static void __init dcache_init(void)
 /* SLAB cache for __getname() consumers */
 struct kmem_cache *names_cachep __read_mostly;
 
-/* SLAB cache for file structures */
-struct kmem_cache *filp_cachep __read_mostly;
-
 EXPORT_SYMBOL(d_genocide);
 
 void __init vfs_caches_init_early(void)
@@ -2338,9 +2335,6 @@ void __init vfs_caches_init(unsigned long mempages)
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
-	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-
 	dcache_init();
 	inode_init();
 	files_init(mempages);
diff --git a/fs/file_table.c b/fs/file_table.c
index 0fbcacc3ea7..bbeeac6efa1 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,6 +32,9 @@ struct files_stat_struct files_stat = {
 /* public. Not pretty! */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
+/* SLAB cache for file structures */
+static struct kmem_cache *filp_cachep __read_mostly;
+
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
 static inline void file_free_rcu(struct rcu_head *head)
@@ -397,7 +400,12 @@ too_bad:
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
-	/* One file with associated inode and dcache is very roughly 1K. 
+
+	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	/*
+	 * One file with associated inode and dcache is very roughly 1K.
 	 * Per default don't use more than 10% of our memory for files. 
 	 */ 
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 4aab6f12cfa..09d6c5bbddd 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -57,8 +57,6 @@ struct files_struct {
 
 #define files_fdtable(files) (rcu_dereference((files)->fdt))
 
-extern struct kmem_cache *filp_cachep;
-
 struct file_operations;
 struct vfsmount;
 struct dentry;
-- 
cgit v1.2.3-70-g09d2


From 6badd79bd002788aaec27b50a74ab69ef65ab8ee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Dec 2008 00:57:40 -0500
Subject: kill ->dir_notify()

Remove the hopelessly misguided ->dir_notify().  The only instance (cifs)
has been broken by design from the very beginning; the objects it creates
are never destroyed, keep references to struct file they can outlive, nothing
that could possibly evict them exists on close(2) path *and* no locking
whatsoever is done to prevent races with close(), should the previous, er,
deficiencies someday be dealt with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking |   2 -
 Documentation/filesystems/vfs.txt |   3 -
 fs/bad_inode.c                    |   6 --
 fs/cifs/Makefile                  |   2 +-
 fs/cifs/cifsfs.c                  |   7 ---
 fs/cifs/cifsfs.h                  |   1 -
 fs/cifs/fcntl.c                   | 118 --------------------------------------
 fs/dnotify.c                      |   3 -
 include/linux/fs.h                |   1 -
 9 files changed, 1 insertion(+), 142 deletions(-)
 delete mode 100644 fs/cifs/fcntl.c

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 23d2f4460de..ccec5539438 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -394,7 +394,6 @@ prototypes:
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long,
 			unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
-	int (*dir_notify)(struct file *, unsigned long);
 };
 
 locking rules:
@@ -424,7 +423,6 @@ sendfile:		no
 sendpage:		no
 get_unmapped_area:	no
 check_flags:		no
-dir_notify:		no
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 041cb771d50..ef19afa186a 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -733,7 +733,6 @@ struct file_operations {
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
-	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
@@ -800,8 +799,6 @@ otherwise noted.
 
   check_flags: called by the fcntl(2) system call for F_SETFL command
 
-  dir_notify: called by the fcntl(2) system call for F_NOTIFY command
-
   flock: called by the flock(2) system call
 
   splice_write: called by the VFS to splice data from a pipe to a file. This
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1..a05287a23f6 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
 	return -EIO;
 }
 
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
-	return -EIO;
-}
-
 static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
 	.check_flags	= bad_file_check_flags,
-	.dir_notify	= bad_file_dir_notify,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346f..9948c0030e8 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75..13ea53251dc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
 	.readdir = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
 	.llseek = generic_file_llseek,
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74..7ac481841f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
-extern int cifs_dir_notify(struct file *, unsigned long arg);
 
 /* Functions related to dir entries */
 extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b..00000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *   fs/cifs/fcntl.c
- *
- *   vfs operations that deal with the file control API
- *
- *   Copyright (C) International Business Machines  Corp., 2003,2004
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "cifsfs.h"
-
-static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
-{
-	__u32 cifs_ntfy_flags = 0;
-
-	/* No way on Linux VFS to ask to monitor xattr
-	changes (and no stream support either */
-	if (fcntl_notify_flags & DN_ACCESS)
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
-	if (fcntl_notify_flags & DN_MODIFY) {
-		/* What does this mean on directories? */
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
-			FILE_NOTIFY_CHANGE_SIZE;
-	}
-	if (fcntl_notify_flags & DN_CREATE) {
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
-			FILE_NOTIFY_CHANGE_LAST_WRITE;
-	}
-	if (fcntl_notify_flags & DN_DELETE)
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
-	if (fcntl_notify_flags & DN_RENAME) {
-		/* BB review this - checking various server behaviors */
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
-			FILE_NOTIFY_CHANGE_FILE_NAME;
-	}
-	if (fcntl_notify_flags & DN_ATTRIB) {
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
-			FILE_NOTIFY_CHANGE_ATTRIBUTES;
-	}
-/*	if (fcntl_notify_flags & DN_MULTISHOT) {
-		cifs_ntfy_flags |= ;
-	} */ /* BB fixme - not sure how to handle this with CIFS yet */
-
-	return cifs_ntfy_flags;
-}
-
-int cifs_dir_notify(struct file *file, unsigned long arg)
-{
-	int xid;
-	int rc = -EINVAL;
-	int oplock = 0;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
-	char *full_path = NULL;
-	__u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
-	__u16 netfid;
-
-	if (experimEnabled == 0)
-		return 0;
-
-	xid = GetXid();
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	full_path = build_path_from_dentry(file->f_path.dentry);
-
-	if (full_path == NULL) {
-		rc = -ENOMEM;
-	} else {
-		cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-			GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
-			&netfid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		/* BB fixme - add this handle to a notify handle list */
-		if (rc) {
-			cFYI(1, ("Could not open directory for notify"));
-		} else {
-			filter = convert_to_cifs_notify_flags(arg);
-			if (filter != 0) {
-				rc = CIFSSMBNotify(xid, pTcon,
-					0 /* no subdirs */, netfid,
-					filter, file, arg & DN_MULTISHOT,
-					cifs_sb->local_nls);
-			} else {
-				rc = -EINVAL;
-			}
-			/* BB add code to close file eventually (at unmount
-			it would close automatically but may be a way
-			to do it easily when inode freed or when
-			notify info is cleared/changed */
-			cFYI(1, ("notify rc %d", rc));
-		}
-	}
-
-	FreeXid(xid);
-	return rc;
-}
diff --git a/fs/dnotify.c b/fs/dnotify.c
index 676073b8dda..b0aa2cde80b 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	dn->dn_next = inode->i_dnotify;
 	inode->i_dnotify = dn;
 	spin_unlock(&inode->i_lock);
-
-	if (filp->f_op && filp->f_op->dir_notify)
-		return filp->f_op->dir_notify(filp, arg);
 	return 0;
 
 out_free:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd615986a41..be16ce01fb1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1309,7 +1309,6 @@ struct file_operations {
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
-	int (*dir_notify)(struct file *filp, unsigned long arg);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
-- 
cgit v1.2.3-70-g09d2


From 261bca86ed4f7f391d1938167624e78da61dcc6b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 01:48:21 -0500
Subject: nfsd/create race fixes, infrastructure

new helpers - insert_inode_locked() and insert_inode_locked4().
Hash new inode, making sure that there's no such inode in icache
already.  If there is and it does not end up unhashed (as would
happen if we have nfsd trying to resolve a bogus fhandle), fail.
Otherwise insert our inode into hash and succeed.

In either case have i_state set to new+locked; cleanup ends up
being simpler with such calling conventions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 098a2443196..7de1cda9248 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1032,6 +1032,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 
 EXPORT_SYMBOL(iget_locked);
 
+int insert_inode_locked(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	ino_t ino = inode->i_ino;
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+	struct inode *old;
+
+	inode->i_state |= I_LOCK|I_NEW;
+	while (1) {
+		spin_lock(&inode_lock);
+		old = find_inode_fast(sb, head, ino);
+		if (likely(!old)) {
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&inode_lock);
+		wait_on_inode(old);
+		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+
+EXPORT_SYMBOL(insert_inode_locked);
+
+int insert_inode_locked4(struct inode *inode, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *old;
+
+	inode->i_state |= I_LOCK|I_NEW;
+
+	while (1) {
+		spin_lock(&inode_lock);
+		old = find_inode(sb, head, test, data);
+		if (likely(!old)) {
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&inode_lock);
+		wait_on_inode(old);
+		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+
+EXPORT_SYMBOL(insert_inode_locked4);
+
 /**
  *	__insert_inode_hash - hash an inode
  *	@inode: unhashed inode
diff --git a/include/linux/fs.h b/include/linux/fs.h
index be16ce01fb1..e2170ee21e1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1902,6 +1902,8 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
 
 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
 extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
+extern int insert_inode_locked(struct inode *);
 extern void unlock_new_inode(struct inode *);
 
 extern void __iget(struct inode * inode);
-- 
cgit v1.2.3-70-g09d2


From ab53d472e785e51fdfc08fc1d66252c1153e6c0f Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:19 +1030
Subject: bitmap: find_last_bit()

Impact: New API

As the name suggests.  For the moment everyone uses the generic one.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/bitops.h | 13 ++++++++++++-
 lib/Kconfig            |  4 ++++
 lib/Makefile           |  1 +
 lib/find_last_bit.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 lib/find_last_bit.c

(limited to 'include/linux')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 024f2b02724..61829139795 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -134,9 +134,20 @@ extern unsigned long find_first_bit(const unsigned long *addr,
  */
 extern unsigned long find_first_zero_bit(const unsigned long *addr,
 					 unsigned long size);
-
 #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
 
+#ifdef CONFIG_GENERIC_FIND_LAST_BIT
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit number of the first set bit, or size.
+ */
+extern unsigned long find_last_bit(const unsigned long *addr,
+				   unsigned long size);
+#endif /* CONFIG_GENERIC_FIND_LAST_BIT */
+
 #ifdef CONFIG_GENERIC_FIND_NEXT_BIT
 
 /**
diff --git a/lib/Kconfig b/lib/Kconfig
index 2ba43c4a5b0..fc5f5ee50bc 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -13,6 +13,10 @@ config GENERIC_FIND_FIRST_BIT
 config GENERIC_FIND_NEXT_BIT
 	bool
 
+config GENERIC_FIND_LAST_BIT
+	bool
+	default y
+
 config CRC_CCITT
 	tristate "CRC-CCITT functions"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index 80fe8a3ec12..32b0e64ded2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -37,6 +37,7 @@ lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
+lib-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_PLIST) += plist.o
diff --git a/lib/find_last_bit.c b/lib/find_last_bit.c
new file mode 100644
index 00000000000..5d202e36bdd
--- /dev/null
+++ b/lib/find_last_bit.c
@@ -0,0 +1,45 @@
+/* find_last_bit.c: fallback find next bit implementation
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howell's find_next_bit implementation)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <asm/types.h>
+#include <asm/byteorder.h>
+
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+	unsigned long words;
+	unsigned long tmp;
+
+	/* Start at final word. */
+	words = size / BITS_PER_LONG;
+
+	/* Partial final word? */
+	if (size & (BITS_PER_LONG-1)) {
+		tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
+					 - (size & (BITS_PER_LONG-1)))));
+		if (tmp)
+			goto found;
+	}
+
+	while (words) {
+		tmp = addr[--words];
+		if (tmp) {
+found:
+			return words * BITS_PER_LONG + __fls(tmp);
+		}
+	}
+
+	/* Not found */
+	return size;
+}
+EXPORT_SYMBOL(find_last_bit);
-- 
cgit v1.2.3-70-g09d2


From 6b954823c24f04ed026a8517f6bab5abda279db8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:25 +1030
Subject: cpumask: convert kernel time functions

Impact: Use new APIs

Convert kernel/time functions to use struct cpumask *.

Note the ugly bitmap declarations in tick-broadcast.c.  These should
be cpumask_var_t, but there was no obvious initialization function to
put the alloc_cpumask_var() calls in.  This was safe.

(Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK,
so we use a bitmap here to show we really mean it).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 include/linux/tick.h         |   4 +-
 kernel/time/clocksource.c    |   6 +--
 kernel/time/tick-broadcast.c | 113 ++++++++++++++++++++++---------------------
 kernel/time/tick-common.c    |   6 +--
 4 files changed, 66 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index b6ec8189ac0..469b82d88b3 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -84,10 +84,10 @@ static inline void tick_cancel_sched_timer(int cpu) { }
 
 # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 extern struct tick_device *tick_get_broadcast_device(void);
-extern cpumask_t *tick_get_broadcast_mask(void);
+extern struct cpumask *tick_get_broadcast_mask(void);
 
 #  ifdef CONFIG_TICK_ONESHOT
-extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
+extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
 #  endif
 
 # endif /* BROADCAST */
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec9752..32141b15d63 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -148,7 +148,7 @@ static void clocksource_watchdog(unsigned long data)
 		int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
 
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = first_cpu(cpu_online_map);
+			next_cpu = cpumask_first(cpu_online_mask);
 		watchdog_timer.expires += WATCHDOG_INTERVAL;
 		add_timer_on(&watchdog_timer, next_cpu);
 	}
@@ -173,7 +173,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 			watchdog_last = watchdog->read();
 			watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 			add_timer_on(&watchdog_timer,
-				     first_cpu(cpu_online_map));
+				     cpumask_first(cpu_online_mask));
 		}
 	} else {
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +195,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
 				watchdog_timer.expires =
 					jiffies + WATCHDOG_INTERVAL;
 				add_timer_on(&watchdog_timer,
-					     first_cpu(cpu_online_map));
+					     cpumask_first(cpu_online_mask));
 			}
 		}
 	}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9590af2327b..356fac57a18 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
  */
 
 struct tick_device tick_broadcast_device;
-static cpumask_t tick_broadcast_mask;
+/* FIXME: Use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
+static DECLARE_BITMAP(tmpmask, NR_CPUS);
 static DEFINE_SPINLOCK(tick_broadcast_lock);
 static int tick_broadcast_force;
 
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
 	return &tick_broadcast_device;
 }
 
-cpumask_t *tick_get_broadcast_mask(void)
+struct cpumask *tick_get_broadcast_mask(void)
 {
-	return &tick_broadcast_mask;
+	return to_cpumask(tick_broadcast_mask);
 }
 
 /*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
 
 	clockevents_exchange_device(NULL, dev);
 	tick_broadcast_device.evtdev = dev;
-	if (!cpus_empty(tick_broadcast_mask))
+	if (!cpumask_empty(tick_get_broadcast_mask()))
 		tick_broadcast_start_periodic(dev);
 	return 1;
 }
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 	 */
 	if (!tick_device_is_functional(dev)) {
 		dev->event_handler = tick_handle_periodic;
-		cpu_set(cpu, tick_broadcast_mask);
+		cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
 		ret = 1;
 	} else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
 			int cpu = smp_processor_id();
 
-			cpu_clear(cpu, tick_broadcast_mask);
+			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 			tick_broadcast_clear_oneshot(cpu);
 		}
 	}
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
 }
 
 /*
- * Broadcast the event to the cpus, which are set in the mask
+ * Broadcast the event to the cpus, which are set in the mask (mangled).
  */
-static void tick_do_broadcast(cpumask_t mask)
+static void tick_do_broadcast(struct cpumask *mask)
 {
 	int cpu = smp_processor_id();
 	struct tick_device *td;
@@ -135,22 +137,21 @@ static void tick_do_broadcast(cpumask_t mask)
 	/*
 	 * Check, if the current cpu is in the mask
 	 */
-	if (cpu_isset(cpu, mask)) {
-		cpu_clear(cpu, mask);
+	if (cpumask_test_cpu(cpu, mask)) {
+		cpumask_clear_cpu(cpu, mask);
 		td = &per_cpu(tick_cpu_device, cpu);
 		td->evtdev->event_handler(td->evtdev);
 	}
 
-	if (!cpus_empty(mask)) {
+	if (!cpumask_empty(mask)) {
 		/*
 		 * It might be necessary to actually check whether the devices
 		 * have different broadcast functions. For now, just use the
 		 * one of the first device. This works as long as we have this
 		 * misfeature only on x86 (lapic)
 		 */
-		cpu = first_cpu(mask);
-		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->broadcast(&mask);
+		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
+		td->evtdev->broadcast(mask);
 	}
 }
 
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
  */
 static void tick_do_periodic_broadcast(void)
 {
-	cpumask_t mask;
-
 	spin_lock(&tick_broadcast_lock);
 
-	cpus_and(mask, cpu_online_map, tick_broadcast_mask);
-	tick_do_broadcast(mask);
+	cpumask_and(to_cpumask(tmpmask),
+		    cpu_online_mask, tick_get_broadcast_mask());
+	tick_do_broadcast(to_cpumask(tmpmask));
 
 	spin_unlock(&tick_broadcast_lock);
 }
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
 	if (!tick_device_is_functional(dev))
 		goto out;
 
-	bc_stopped = cpus_empty(tick_broadcast_mask);
+	bc_stopped = cpumask_empty(tick_get_broadcast_mask());
 
 	switch (*reason) {
 	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
-		if (!cpu_isset(cpu, tick_broadcast_mask)) {
-			cpu_set(cpu, tick_broadcast_mask);
+		if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+			cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 			if (tick_broadcast_device.mode ==
 			    TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
 		break;
 	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
 		if (!tick_broadcast_force &&
-		    cpu_isset(cpu, tick_broadcast_mask)) {
-			cpu_clear(cpu, tick_broadcast_mask);
+		    cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 			if (tick_broadcast_device.mode ==
 			    TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
 		break;
 	}
 
-	if (cpus_empty(tick_broadcast_mask)) {
+	if (cpumask_empty(tick_get_broadcast_mask())) {
 		if (!bc_stopped)
 			clockevents_shutdown(bc);
 	} else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
  */
 void tick_broadcast_on_off(unsigned long reason, int *oncpu)
 {
-	if (!cpu_isset(*oncpu, cpu_online_map))
+	if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
 		printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
 		       "offline CPU #%d\n", *oncpu);
 	else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
 	spin_lock_irqsave(&tick_broadcast_lock, flags);
 
 	bc = tick_broadcast_device.evtdev;
-	cpu_clear(cpu, tick_broadcast_mask);
+	cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
-		if (bc && cpus_empty(tick_broadcast_mask))
+		if (bc && cpumask_empty(tick_get_broadcast_mask()))
 			clockevents_shutdown(bc);
 	}
 
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
 
 		switch (tick_broadcast_device.mode) {
 		case TICKDEV_MODE_PERIODIC:
-			if(!cpus_empty(tick_broadcast_mask))
+			if (!cpumask_empty(tick_get_broadcast_mask()))
 				tick_broadcast_start_periodic(bc);
-			broadcast = cpu_isset(smp_processor_id(),
-					      tick_broadcast_mask);
+			broadcast = cpumask_test_cpu(smp_processor_id(),
+						     tick_get_broadcast_mask());
 			break;
 		case TICKDEV_MODE_ONESHOT:
 			broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
 
 #ifdef CONFIG_TICK_ONESHOT
 
-static cpumask_t tick_broadcast_oneshot_mask;
+/* FIXME: use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
 
 /*
- * Debugging: see timer_list.c
+ * Exposed for debugging: see timer_list.c
  */
-cpumask_t *tick_get_broadcast_oneshot_mask(void)
+struct cpumask *tick_get_broadcast_oneshot_mask(void)
 {
-	return &tick_broadcast_oneshot_mask;
+	return to_cpumask(tick_broadcast_oneshot_mask);
 }
 
 static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
  */
 void tick_check_oneshot_broadcast(int cpu)
 {
-	if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+	if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
 		struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
 
 		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
 static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 {
 	struct tick_device *td;
-	cpumask_t mask;
 	ktime_t now, next_event;
 	int cpu;
 
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 again:
 	dev->next_event.tv64 = KTIME_MAX;
 	next_event.tv64 = KTIME_MAX;
-	mask = CPU_MASK_NONE;
+	cpumask_clear(to_cpumask(tmpmask));
 	now = ktime_get();
 	/* Find all expired events */
-	for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
+	for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev->next_event.tv64 <= now.tv64)
-			cpu_set(cpu, mask);
+			cpumask_set_cpu(cpu, to_cpumask(tmpmask));
 		else if (td->evtdev->next_event.tv64 < next_event.tv64)
 			next_event.tv64 = td->evtdev->next_event.tv64;
 	}
@@ -424,7 +424,7 @@ again:
 	/*
 	 * Wakeup the cpus which have an expired event.
 	 */
-	tick_do_broadcast(mask);
+	tick_do_broadcast(to_cpumask(tmpmask));
 
 	/*
 	 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 		goto out;
 
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
-		if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
-			cpu_set(cpu, tick_broadcast_oneshot_mask);
+		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
+			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 			if (dev->next_event.tv64 < bc->next_event.tv64)
 				tick_broadcast_set_event(dev->next_event, 1);
 		}
 	} else {
-		if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
-			cpu_clear(cpu, tick_broadcast_oneshot_mask);
+		if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
+			cpumask_clear_cpu(cpu,
+					  tick_get_broadcast_oneshot_mask());
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 			if (dev->next_event.tv64 != KTIME_MAX)
 				tick_program_event(dev->next_event, 1);
@@ -502,10 +503,11 @@ out:
  */
 static void tick_broadcast_clear_oneshot(int cpu)
 {
-	cpu_clear(cpu, tick_broadcast_oneshot_mask);
+	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 }
 
-static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+static void tick_broadcast_init_next_event(struct cpumask *mask,
+					   ktime_t expires)
 {
 	struct tick_device *td;
 	int cpu;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
 		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
 		int cpu = smp_processor_id();
-		cpumask_t mask;
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 		 * oneshot_mask bits for those and program the
 		 * broadcast device to fire.
 		 */
-		mask = tick_broadcast_mask;
-		cpu_clear(cpu, mask);
-		cpus_or(tick_broadcast_oneshot_mask,
-			tick_broadcast_oneshot_mask, mask);
-
-		if (was_periodic && !cpus_empty(mask)) {
-			tick_broadcast_init_next_event(&mask, tick_next_period);
+		cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
+		cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
+		cpumask_or(tick_get_broadcast_oneshot_mask(),
+			   tick_get_broadcast_oneshot_mask(),
+			   to_cpumask(tmpmask));
+
+		if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
+			tick_broadcast_init_next_event(to_cpumask(tmpmask),
+						       tick_next_period);
 			tick_broadcast_set_event(tick_next_period, 1);
 		} else
 			bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	 * Clear the broadcast mask flag for the dead cpu, but do not
 	 * stop the broadcast device!
 	 */
-	cpu_clear(cpu, tick_broadcast_oneshot_mask);
+	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f8372be7412..63e05d423a0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		curdev = NULL;
 	}
 	clockevents_exchange_device(curdev, newdev);
-	tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
+	tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
 	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
 		tick_oneshot_notify();
 
@@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup)
 	}
 	/* Transfer the do_timer job away from this cpu */
 	if (*cpup == tick_do_timer_cpu) {
-		int cpu = first_cpu(cpu_online_map);
+		int cpu = cpumask_first(cpu_online_mask);
 
-		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+		tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
 			TICK_DO_TIMER_NONE;
 	}
 	spin_unlock_irqrestore(&tick_device_lock, flags);
-- 
cgit v1.2.3-70-g09d2


From d036e67b40f52bdd95392390108defbac7e53837 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:26 +1030
Subject: cpumask: convert kernel/irq

Impact: Reduce stack usage, use new cpumask API.  ALPHA mod!

Main change is that irq_default_affinity becomes a cpumask_var_t, so
treat it as a pointer (this effects alpha).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/alpha/kernel/irq.c   |  3 ++-
 include/linux/interrupt.h |  2 +-
 kernel/irq/manage.c       | 11 +++++++++--
 kernel/irq/proc.c         | 32 +++++++++++++++++++++-----------
 4 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index d0f1620007f..703731accda 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -50,7 +50,8 @@ int irq_select_affinity(unsigned int irq)
 	if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq])
 		return 1;
 
-	while (!cpu_possible(cpu) || !cpu_isset(cpu, irq_default_affinity))
+	while (!cpu_possible(cpu) ||
+	       !cpumask_test_cpu(cpu, irq_default_affinity))
 		cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
 	last_cpu = cpu;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index dfaee6bd265..91f1ef8e581 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -109,7 +109,7 @@ extern void enable_irq(unsigned int irq);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 
-extern cpumask_t irq_default_affinity;
+extern cpumask_var_t irq_default_affinity;
 
 extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 61c4a9b6216..cd0cd8dcb34 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,15 @@
 #include "internals.h"
 
 #ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
 
-cpumask_t irq_default_affinity = CPU_MASK_ALL;
+static int init_irq_default_affinity(void)
+{
+	alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
+	cpumask_setall(irq_default_affinity);
+	return 0;
+}
+core_initcall(init_irq_default_affinity);
 
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -127,7 +134,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+	cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
 	desc->chip->set_affinity(irq, &desc->affinity);
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d2c0e5ee53c..2abd3a7716e 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	cpumask_t *mask = &desc->affinity;
+	const struct cpumask *mask = &desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
@@ -93,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
 
 static int default_affinity_show(struct seq_file *m, void *v)
 {
-	seq_cpumask(m, &irq_default_affinity);
+	seq_cpumask(m, irq_default_affinity);
 	seq_putc(m, '\n');
 	return 0;
 }
@@ -101,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
 static ssize_t default_affinity_write(struct file *file,
 		const char __user *buffer, size_t count, loff_t *ppos)
 {
-	cpumask_t new_value;
+	cpumask_var_t new_value;
 	int err;
 
-	err = cpumask_parse_user(buffer, count, &new_value);
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
-		return err;
+		goto out;
 
-	if (!is_affinity_mask_valid(new_value))
-		return -EINVAL;
+	if (!is_affinity_mask_valid(new_value)) {
+		err = -EINVAL;
+		goto out;
+	}
 
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
-	if (!cpus_intersects(new_value, cpu_online_map))
-		return -EINVAL;
+	if (!cpumask_intersects(new_value, cpu_online_mask)) {
+		err = -EINVAL;
+		goto out;
+	}
 
-	irq_default_affinity = new_value;
+	cpumask_copy(irq_default_affinity, new_value);
+	err = count;
 
-	return count;
+out:
+	free_cpumask_var(new_value);
+	return err;
 }
 
 static int default_affinity_open(struct inode *inode, struct file *file)
-- 
cgit v1.2.3-70-g09d2


From bd232f97b30f6bb630efa136a777647545db3039 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:26 +1030
Subject: cpumask: convert RCU implementations

Impact: use new cpumask API.

rcu_ctrlblk contains a cpumask, and it's highly optimized so I don't want
a cpumask_var_t (ie. a pointer) for the CONFIG_CPUMASK_OFFSTACK case.  It
could use a dangling bitmap, and be allocated in __rcu_init to save memory,
but for the moment we use a bitmap.

(Eventually 'struct cpumask' will be undefined for CONFIG_CPUMASK_OFFSTACK,
so we use a bitmap here to show we really mean it).

We remove on-stack cpumasks, using cpumask_var_t for
rcu_torture_shuffle_tasks() and for_each_cpu_and in force_quiescent_state().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/rcuclassic.h |  4 ++--
 kernel/rcuclassic.c        | 32 +++++++++++++++++---------------
 kernel/rcupreempt.c        | 19 ++++++++++---------
 kernel/rcutorture.c        | 27 +++++++++++++++------------
 4 files changed, 44 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 301dda829e3..f3f697df1d7 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -59,8 +59,8 @@ struct rcu_ctrlblk {
 	int	signaled;
 
 	spinlock_t	lock	____cacheline_internodealigned_in_smp;
-	cpumask_t	cpumask; /* CPUs that need to switch in order    */
-				 /* for current batch to proceed.        */
+	DECLARE_BITMAP(cpumask, NR_CPUS); /* CPUs that need to switch for */
+					  /* current batch to proceed.     */
 } ____cacheline_internodealigned_in_smp;
 
 /* Is batch a before batch b ? */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f33..0ff9b05706a 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 	.completed = -300,
 	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
+	.cpumask = CPU_BITS_NONE,
 };
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
+	.cpumask = CPU_BITS_NONE,
 };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
 			struct rcu_ctrlblk *rcp)
 {
 	int cpu;
-	cpumask_t cpumask;
 	unsigned long flags;
 
 	set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 * Don't send IPI to itself. With irqs disabled,
 		 * rdp->cpu is the current cpu.
 		 *
-		 * cpu_online_map is updated by the _cpu_down()
+		 * cpu_online_mask is updated by the _cpu_down()
 		 * using __stop_machine(). Since we're in irqs disabled
 		 * section, __stop_machine() is not exectuting, hence
-		 * the cpu_online_map is stable.
+		 * the cpu_online_mask is stable.
 		 *
 		 * However,  a cpu might have been offlined _just_ before
 		 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 * notification, leading to the offlined cpu's bit
 		 * being set in the rcp->cpumask.
 		 *
-		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+		 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
 		 * sending smp_reschedule() to an offlined CPU.
 		 */
-		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask_nr(cpu, cpumask)
-			smp_send_reschedule(cpu);
+		for_each_cpu_and(cpu,
+				  to_cpumask(rcp->cpumask), cpu_online_mask) {
+			if (cpu != rdp->cpu)
+				smp_send_reschedule(cpu);
+		}
 	}
 	spin_unlock_irqrestore(&rcp->lock, flags);
 }
@@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
 	for_each_possible_cpu(cpu) {
-		if (cpu_isset(cpu, rcp->cpumask))
+		if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
 			printk(" %d", cpu);
 	}
 	printk(" (detected by %d, t=%ld jiffies)\n",
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 	long delta;
 
 	delta = jiffies - rcp->jiffies_stall;
-	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+	if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
+		delta >= 0) {
 
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rcp);
@@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 		 * unnecessarily.
 		 */
 		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+		cpumask_andnot(to_cpumask(rcp->cpumask),
+			       cpu_online_mask, &nohz_cpu_mask);
 
 		rcp->signaled = 0;
 	}
@@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
  */
 static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 {
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
+	cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
+	if (cpumask_empty(to_cpumask(rcp->cpumask))) {
 		/* batch completed ! */
 		rcp->completed = rcp->cur;
 		rcu_start_batch(rcp);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 04982659875..f9dc8f3720f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
 	{ "idle", "waitack", "waitzero", "waitmb" };
 #endif /* #ifdef CONFIG_RCU_TRACE */
 
-static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
+static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
+	= CPU_BITS_NONE;
 
 /*
  * Enum and per-CPU flag to determine when each CPU has seen
@@ -758,7 +759,7 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -776,7 +777,7 @@ rcu_try_flip_waitack(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
 		if (rcu_try_flip_waitack_needed(cpu) &&
 		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
+	for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
 		if (rcu_try_flip_waitmb_needed(cpu) &&
 		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
 	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
 	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
 
-	cpu_clear(cpu, rcu_cpu_online_map);
+	cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
 
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 
@@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
 	struct rcu_data *rdp;
 
 	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
-	cpu_set(cpu, rcu_cpu_online_map);
+	cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 
 	/*
@@ -1430,7 +1431,7 @@ void __init __rcu_init(void)
 	 * We don't need protection against CPU-Hotplug here
 	 * since
 	 * a) If a CPU comes online while we are iterating over the
-	 *    cpu_online_map below, we would only end up making a
+	 *    cpu_online_mask below, we would only end up making a
 	 *    duplicate call to rcu_online_cpu() which sets the corresponding
 	 *    CPU's mask in the rcu_cpu_online_map.
 	 *
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b3106552210..3245b40952c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -868,49 +868,52 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
  */
 static void rcu_torture_shuffle_tasks(void)
 {
-	cpumask_t tmp_mask;
+	cpumask_var_t tmp_mask;
 	int i;
 
-	cpus_setall(tmp_mask);
+	if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
+		BUG();
+
+	cpumask_setall(tmp_mask);
 	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
-	if (num_online_cpus() == 1) {
-		put_online_cpus();
-		return;
-	}
+	if (num_online_cpus() == 1)
+		goto out;
 
 	if (rcu_idle_cpu != -1)
-		cpu_clear(rcu_idle_cpu, tmp_mask);
+		cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
 
-	set_cpus_allowed_ptr(current, &tmp_mask);
+	set_cpus_allowed_ptr(current, tmp_mask);
 
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			if (reader_tasks[i])
 				set_cpus_allowed_ptr(reader_tasks[i],
-						     &tmp_mask);
+						     tmp_mask);
 	}
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++)
 			if (fakewriter_tasks[i])
 				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     &tmp_mask);
+						     tmp_mask);
 	}
 
 	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, &tmp_mask);
+		set_cpus_allowed_ptr(writer_task, tmp_mask);
 
 	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, &tmp_mask);
+		set_cpus_allowed_ptr(stats_task, tmp_mask);
 
 	if (rcu_idle_cpu == -1)
 		rcu_idle_cpu = num_online_cpus() - 1;
 	else
 		rcu_idle_cpu--;
 
+out:
 	put_online_cpus();
+	free_cpumask_var(tmp_mask);
 }
 
 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
-- 
cgit v1.2.3-70-g09d2


From 41c7bb9588904eb060a95bcad47bd3804a1ece25 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:28 +1030
Subject: cpumask: convert rest of files in kernel/

Impact: Reduce stack usage, use new cpumask API.

Mainly changing cpumask_t to 'struct cpumask' and similar simple API
conversion.  Two conversions worth mentioning:

1) we use cpumask_any_but to avoid a temporary in kernel/softlockup.c,
2) Use cpumask_var_t in taskstats_user_cmd().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
---
 include/linux/stop_machine.h |  6 +++---
 kernel/power/poweroff.c      |  2 +-
 kernel/softlockup.c          |  6 ++----
 kernel/stop_machine.c        |  8 ++++----
 kernel/taskstats.c           | 39 ++++++++++++++++++++++++---------------
 5 files changed, 34 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index faf1519b5ad..74d59a64136 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -23,7 +23,7 @@
  *
  * This can be thought of as a very heavy write lock, equivalent to
  * grabbing every spinlock in the kernel. */
-int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus);
+int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
 /**
  * __stop_machine: freeze the machine on all CPUs and run this function
@@ -34,11 +34,11 @@ int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus);
  * Description: This is a special version of the above, which assumes cpus
  * won't come or go while it's being called.  Used by hotplug cpu.
  */
-int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus);
+int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 #else
 
 static inline int stop_machine(int (*fn)(void *), void *data,
-			       const cpumask_t *cpus)
+			       const struct cpumask *cpus)
 {
 	int ret;
 	local_irq_disable();
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f05147..97890831e1b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
 	/* run sysrq poweroff on boot cpu */
-	schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
+	schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
 }
 
 static struct sysrq_key_op	sysrq_poweroff_op = {
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 492f0c72fec..d9188c66278 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -310,10 +310,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		if (hotcpu == check_cpu) {
-			cpumask_t temp_cpu_online_map = cpu_online_map;
-
-			cpu_clear(hotcpu, temp_cpu_online_map);
-			check_cpu = cpumask_any(&temp_cpu_online_map);
+			/* Pick any other online cpu. */
+			check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
 		}
 		break;
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc38..286c41722e8 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -69,10 +69,10 @@ static void stop_cpu(struct work_struct *unused)
 	int err;
 
 	if (!active_cpus) {
-		if (cpu == first_cpu(cpu_online_map))
+		if (cpu == cpumask_first(cpu_online_mask))
 			smdata = &active;
 	} else {
-		if (cpu_isset(cpu, *active_cpus))
+		if (cpumask_test_cpu(cpu, active_cpus))
 			smdata = &active;
 	}
 	/* Simple state machine */
@@ -109,7 +109,7 @@ static int chill(void *unused)
 	return 0;
 }
 
-int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
+int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	struct work_struct *sm_work;
 	int i, ret;
@@ -142,7 +142,7 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 	return ret;
 }
 
-int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
+int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	int ret;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 6d7dc4ec4aa..888adbcca30 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
 	return;
 }
 
-static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
 {
 	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	unsigned int cpu;
-	cpumask_t mask = *maskp;
 
-	if (!cpus_subset(mask, cpu_possible_map))
+	if (!cpumask_subset(mask, cpu_possible_mask))
 		return -EINVAL;
 
 	if (isadd == REGISTER) {
-		for_each_cpu_mask_nr(cpu, mask) {
+		for_each_cpu(cpu, mask) {
 			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
 					 cpu_to_node(cpu));
 			if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 
 	/* Deregister or cleanup */
 cleanup:
-	for_each_cpu_mask_nr(cpu, mask) {
+	for_each_cpu(cpu, mask) {
 		listeners = &per_cpu(listener_array, cpu);
 		down_write(&listeners->sem);
 		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
 	return 0;
 }
 
-static int parse(struct nlattr *na, cpumask_t *mask)
+static int parse(struct nlattr *na, struct cpumask *mask)
 {
 	char *data;
 	int len;
@@ -428,23 +427,33 @@ err:
 
 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
-	int rc = 0;
+	int rc;
 	struct sk_buff *rep_skb;
 	struct taskstats *stats;
 	size_t size;
-	cpumask_t mask;
+	cpumask_var_t mask;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
 
-	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
 	if (rc < 0)
-		return rc;
-	if (rc == 0)
-		return add_del_listener(info->snd_pid, &mask, REGISTER);
+		goto free_return_rc;
+	if (rc == 0) {
+		rc = add_del_listener(info->snd_pid, mask, REGISTER);
+		goto free_return_rc;
+	}
 
-	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
 	if (rc < 0)
+		goto free_return_rc;
+	if (rc == 0) {
+		rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
+free_return_rc:
+		free_cpumask_var(mask);
 		return rc;
-	if (rc == 0)
-		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
+	}
+	free_cpumask_var(mask);
 
 	/*
 	 * Size includes space for nested attributes
-- 
cgit v1.2.3-70-g09d2


From 8c384cdee3e04d6194a2c2b192b624754f990835 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 1 Jan 2009 10:12:30 +1030
Subject: cpumask: CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS

Impact: new debug CONFIG options

This helps find unconverted code.  It currently breaks compile horribly,
but we never wanted a flag day so that's expected.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/cpumask.h | 11 ++++++++++-
 lib/Kconfig             |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 7c178a6baae..9f315382610 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -144,6 +144,7 @@
 typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 extern cpumask_t _unused_cpumask_arg_;
 
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
 static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
 {
@@ -267,6 +268,7 @@ static inline void __cpus_shift_left(cpumask_t *dstp,
 {
 	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
 }
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
@@ -304,6 +306,7 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
 	return to_cpumask(p);
 }
 
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 /*
  * In cases where we take the address of the cpumask immediately,
  * gcc optimizes it out (it's a constant) and there's no huge stack
@@ -389,19 +392,22 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 {
 	bitmap_fold(dstp->bits, origp->bits, sz, nbits);
 }
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 #if NR_CPUS == 1
 
 #define nr_cpu_ids		1
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #define first_cpu(src)		({ (void)(src); 0; })
 #define next_cpu(n, src)	({ (void)(src); 1; })
 #define any_online_cpu(mask)	0
 #define for_each_cpu_mask(cpu, mask)	\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 #else /* NR_CPUS > 1 */
 
 extern int nr_cpu_ids;
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 int __first_cpu(const cpumask_t *srcp);
 int __next_cpu(int n, const cpumask_t *srcp);
 int __any_online_cpu(const cpumask_t *mask);
@@ -413,8 +419,10 @@ int __any_online_cpu(const cpumask_t *mask);
 	for ((cpu) = -1;				\
 		(cpu) = next_cpu((cpu), (mask)),	\
 		(cpu) < NR_CPUS; )
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 #endif
 
+#ifndef CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
 #if NR_CPUS <= 64
 
 #define next_cpu_nr(n, src)		next_cpu(n, src)
@@ -432,6 +440,7 @@ int __next_cpu_nr(int n, const cpumask_t *srcp);
 		(cpu) < nr_cpu_ids; )
 
 #endif /* NR_CPUS > 64 */
+#endif /* !CONFIG_DISABLE_OBSOLETE_CPUMASK_FUNCTIONS */
 
 /*
  * The following particular system cpumasks and operations manage
diff --git a/lib/Kconfig b/lib/Kconfig
index fc5f5ee50bc..03c2c24b908 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -170,4 +170,8 @@ config CPUMASK_OFFSTACK
 	  them on the stack.  This is a bit more expensive, but avoids
 	  stack overflow.
 
+config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
+       bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
+       depends on EXPERIMENTAL && BROKEN
+
 endmenu
-- 
cgit v1.2.3-70-g09d2


From ebdab07dad3d3a008e519b0a028e1e1ad5ecaef0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 2 Jan 2009 16:12:48 +0100
Subject: ide: move sysfs support to ide-sysfs.c

While at it:
- media_string() -> ide_media_string()

There should be no functional changes caused by this patch.

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Makefile    |   2 +-
 drivers/ide/ide-probe.c |  52 --------------------
 drivers/ide/ide-sysfs.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/ide/ide.c       |  72 +---------------------------
 include/linux/ide.h     |   4 ++
 5 files changed, 132 insertions(+), 123 deletions(-)
 create mode 100644 drivers/ide/ide-sysfs.c

(limited to 'include/linux')

diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile
index 177e3f8523e..410728992e6 100644
--- a/drivers/ide/Makefile
+++ b/drivers/ide/Makefile
@@ -5,7 +5,7 @@
 EXTRA_CFLAGS				+= -Idrivers/ide
 
 ide-core-y += ide.o ide-ioctls.o ide-io.o ide-iops.o ide-lib.o ide-probe.o \
-	      ide-taskfile.o ide-pm.o ide-park.o ide-pio-blacklist.o
+	      ide-taskfile.o ide-pm.o ide-park.o ide-pio-blacklist.o ide-sysfs.o
 
 # core IDE code
 ide-core-$(CONFIG_IDE_TIMINGS)		+= ide-timings.o
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 91f5faee740..f9efd069edc 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1420,58 +1420,6 @@ static void ide_port_cable_detect(ide_hwif_t *hwif)
 	}
 }
 
-static ssize_t store_delete_devices(struct device *portdev,
-				    struct device_attribute *attr,
-				    const char *buf, size_t n)
-{
-	ide_hwif_t *hwif = dev_get_drvdata(portdev);
-
-	if (strncmp(buf, "1", n))
-		return -EINVAL;
-
-	ide_port_unregister_devices(hwif);
-
-	return n;
-};
-
-static DEVICE_ATTR(delete_devices, S_IWUSR, NULL, store_delete_devices);
-
-static ssize_t store_scan(struct device *portdev,
-			  struct device_attribute *attr,
-			  const char *buf, size_t n)
-{
-	ide_hwif_t *hwif = dev_get_drvdata(portdev);
-
-	if (strncmp(buf, "1", n))
-		return -EINVAL;
-
-	ide_port_unregister_devices(hwif);
-	ide_port_scan(hwif);
-
-	return n;
-};
-
-static DEVICE_ATTR(scan, S_IWUSR, NULL, store_scan);
-
-static struct device_attribute *ide_port_attrs[] = {
-	&dev_attr_delete_devices,
-	&dev_attr_scan,
-	NULL
-};
-
-static int ide_sysfs_register_port(ide_hwif_t *hwif)
-{
-	int i, uninitialized_var(rc);
-
-	for (i = 0; ide_port_attrs[i]; i++) {
-		rc = device_create_file(hwif->portdev, ide_port_attrs[i]);
-		if (rc)
-			break;
-	}
-
-	return rc;
-}
-
 static unsigned int ide_indexes;
 
 /**
diff --git a/drivers/ide/ide-sysfs.c b/drivers/ide/ide-sysfs.c
new file mode 100644
index 00000000000..883ffacaf45
--- /dev/null
+++ b/drivers/ide/ide-sysfs.c
@@ -0,0 +1,125 @@
+#include <linux/kernel.h>
+#include <linux/ide.h>
+
+char *ide_media_string(ide_drive_t *drive)
+{
+	switch (drive->media) {
+	case ide_disk:
+		return "disk";
+	case ide_cdrom:
+		return "cdrom";
+	case ide_tape:
+		return "tape";
+	case ide_floppy:
+		return "floppy";
+	case ide_optical:
+		return "optical";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static ssize_t media_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	return sprintf(buf, "%s\n", ide_media_string(drive));
+}
+
+static ssize_t drivename_show(struct device *dev, struct device_attribute *attr,
+			      char *buf)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	return sprintf(buf, "%s\n", drive->name);
+}
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	return sprintf(buf, "ide:m-%s\n", ide_media_string(drive));
+}
+
+static ssize_t model_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_PROD]);
+}
+
+static ssize_t firmware_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_FW_REV]);
+}
+
+static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	ide_drive_t *drive = to_ide_device(dev);
+	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_SERNO]);
+}
+
+struct device_attribute ide_dev_attrs[] = {
+	__ATTR_RO(media),
+	__ATTR_RO(drivename),
+	__ATTR_RO(modalias),
+	__ATTR_RO(model),
+	__ATTR_RO(firmware),
+	__ATTR(serial, 0400, serial_show, NULL),
+	__ATTR(unload_heads, 0644, ide_park_show, ide_park_store),
+	__ATTR_NULL
+};
+
+static ssize_t store_delete_devices(struct device *portdev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t n)
+{
+	ide_hwif_t *hwif = dev_get_drvdata(portdev);
+
+	if (strncmp(buf, "1", n))
+		return -EINVAL;
+
+	ide_port_unregister_devices(hwif);
+
+	return n;
+};
+
+static DEVICE_ATTR(delete_devices, S_IWUSR, NULL, store_delete_devices);
+
+static ssize_t store_scan(struct device *portdev,
+			  struct device_attribute *attr,
+			  const char *buf, size_t n)
+{
+	ide_hwif_t *hwif = dev_get_drvdata(portdev);
+
+	if (strncmp(buf, "1", n))
+		return -EINVAL;
+
+	ide_port_unregister_devices(hwif);
+	ide_port_scan(hwif);
+
+	return n;
+};
+
+static DEVICE_ATTR(scan, S_IWUSR, NULL, store_scan);
+
+static struct device_attribute *ide_port_attrs[] = {
+	&dev_attr_delete_devices,
+	&dev_attr_scan,
+	NULL
+};
+
+int ide_sysfs_register_port(ide_hwif_t *hwif)
+{
+	int i, uninitialized_var(rc);
+
+	for (i = 0; ide_port_attrs[i]; i++) {
+		rc = device_create_file(hwif->portdev, ide_port_attrs[i]);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index f0f09f702e9..46a2d4ca812 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -440,81 +440,13 @@ static int ide_bus_match(struct device *dev, struct device_driver *drv)
 	return 1;
 }
 
-static char *media_string(ide_drive_t *drive)
-{
-	switch (drive->media) {
-	case ide_disk:
-		return "disk";
-	case ide_cdrom:
-		return "cdrom";
-	case ide_tape:
-		return "tape";
-	case ide_floppy:
-		return "floppy";
-	case ide_optical:
-		return "optical";
-	default:
-		return "UNKNOWN";
-	}
-}
-
-static ssize_t media_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", media_string(drive));
-}
-
-static ssize_t drivename_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", drive->name);
-}
-
-static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "ide:m-%s\n", media_string(drive));
-}
-
-static ssize_t model_show(struct device *dev, struct device_attribute *attr,
-			  char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_PROD]);
-}
-
-static ssize_t firmware_show(struct device *dev, struct device_attribute *attr,
-			     char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_FW_REV]);
-}
-
-static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
-			   char *buf)
-{
-	ide_drive_t *drive = to_ide_device(dev);
-	return sprintf(buf, "%s\n", (char *)&drive->id[ATA_ID_SERNO]);
-}
-
-static struct device_attribute ide_dev_attrs[] = {
-	__ATTR_RO(media),
-	__ATTR_RO(drivename),
-	__ATTR_RO(modalias),
-	__ATTR_RO(model),
-	__ATTR_RO(firmware),
-	__ATTR(serial, 0400, serial_show, NULL),
-	__ATTR(unload_heads, 0644, ide_park_show, ide_park_store),
-	__ATTR_NULL
-};
-
 static int ide_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
 	ide_drive_t *drive = to_ide_device(dev);
 
-	add_uevent_var(env, "MEDIA=%s", media_string(drive));
+	add_uevent_var(env, "MEDIA=%s", ide_media_string(drive));
 	add_uevent_var(env, "DRIVENAME=%s", drive->name);
-	add_uevent_var(env, "MODALIAS=ide:m-%s", media_string(drive));
+	add_uevent_var(env, "MODALIAS=ide:m-%s", ide_media_string(drive));
 	return 0;
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index e99c56de7f5..62fccaea311 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1533,6 +1533,7 @@ void ide_unregister_region(struct gendisk *);
 void ide_undecoded_slave(ide_drive_t *);
 
 void ide_port_apply_params(ide_hwif_t *);
+int ide_sysfs_register_port(ide_hwif_t *);
 
 struct ide_host *ide_host_alloc(const struct ide_port_info *, hw_regs_t **);
 void ide_host_free(struct ide_host *);
@@ -1627,6 +1628,9 @@ extern struct mutex ide_cfg_mtx;
 
 #define local_irq_set(flags)	do { local_save_flags((flags)); local_irq_enable_in_hardirq(); } while (0)
 
+char *ide_media_string(ide_drive_t *);
+
+extern struct device_attribute ide_dev_attrs[];
 extern struct bus_type ide_bus_type;
 extern struct class *ide_port_class;
 
-- 
cgit v1.2.3-70-g09d2


From 295f00042aaf6b553b5f37348f89bab463d4a469 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 2 Jan 2009 16:12:48 +0100
Subject: ide: don't execute the next queued command from the hard-IRQ context
 (v2)

* Tell the block layer that we are not done handling requests by using
  blk_plug_device() in ide_do_request() (request handling function)
  and ide_timer_expiry() (timeout handler) if the queue is not empty.

* Remove optimization which directly calls ide_do_request() for the next
  queued command from the ide_intr() (IRQ handler) and ide_timer_expiry().

* Remove no longer needed IRQ masking from ide_do_request() - in case of
  IDE ports needing serialization disable_irq_nosync()/enable_irq() was
  used for the (possibly shared) IRQ of the other IDE port.

* Put the misplaced comment in the right place in ide_do_request().

* Drop no longer needed 'int masked_irq' argument from ide_do_request().

* Merge ide_do_request() into do_ide_request().

* Remove no longer needed IDE_NO_IRQ define.

While at it:

* Don't use HWGROUP() macro in do_ide_request().

* Use __func__ in ide_intr().

This patch reduces IRQ hadling latency for IDE and improves the system-wide
handling of shared IRQs (which should result in more timeout resistant and
stable IDE systems).  It also makes it possible to do some further changes
later (i.e. replace some busy-waiting delays with sleeping equivalents).

v2:
Changes per review from Elias Oltmanns:
- fix wrong goto statement in 'if (startstop == ide_stopped)' block
- use spin_unlock_irq()
- don't use obsolete HWIF() macro

Cc: Elias Oltmanns <eo@nebensachen.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io.c | 67 +++++++++++++++++++++++-----------------------------
 include/linux/ide.h  |  7 ------
 2 files changed, 30 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ecacc008fda..23754bc5e59 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -778,8 +778,10 @@ repeat:
  * the driver.  This makes the driver much more friendlier to shared IRQs
  * than previous designs, while remaining 100% (?) SMP safe and capable.
  */
-static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq)
+void do_ide_request(struct request_queue *q)
 {
+	ide_drive_t	*orig_drive = q->queuedata;
+	ide_hwgroup_t	*hwgroup = orig_drive->hwif->hwgroup;
 	ide_drive_t	*drive;
 	ide_hwif_t	*hwif;
 	struct request	*rq;
@@ -837,10 +839,14 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq)
 			}
 
 			/* no more work for this hwgroup (for now) */
-			return;
+			goto plug_device;
 		}
-	again:
-		hwif = HWIF(drive);
+
+		if (drive != orig_drive)
+			goto plug_device;
+again:
+		hwif = drive->hwif;
+
 		if (hwif != hwgroup->hwif) {
 			/*
 			 * set nIEN for previous hwif, drives in the
@@ -888,41 +894,26 @@ static void ide_do_request (ide_hwgroup_t *hwgroup, int masked_irq)
 				goto again;
 			/* We clear busy, there should be no pending ATA command at this point. */
 			hwgroup->busy = 0;
-			break;
+			goto plug_device;
 		}
 
 		hwgroup->rq = rq;
 
-		/*
-		 * Some systems have trouble with IDE IRQs arriving while
-		 * the driver is still setting things up.  So, here we disable
-		 * the IRQ used by this interface while the request is being started.
-		 * This may look bad at first, but pretty much the same thing
-		 * happens anyway when any interrupt comes in, IDE or otherwise
-		 *  -- the kernel masks the IRQ while it is being handled.
-		 */
-		if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq)
-			disable_irq_nosync(hwif->irq);
-		spin_unlock(&hwgroup->lock);
-		local_irq_enable_in_hardirq();
-			/* allow other IRQs while we start this request */
+		spin_unlock_irq(&hwgroup->lock);
 		startstop = start_request(drive, rq);
 		spin_lock_irq(&hwgroup->lock);
-		if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq)
-			enable_irq(hwif->irq);
-		if (startstop == ide_stopped)
+
+		if (startstop == ide_stopped) {
 			hwgroup->busy = 0;
+			if (!elv_queue_empty(orig_drive->queue))
+				blk_plug_device(orig_drive->queue);
+		}
 	}
-}
+	return;
 
-/*
- * Passes the stuff to ide_do_request
- */
-void do_ide_request(struct request_queue *q)
-{
-	ide_drive_t *drive = q->queuedata;
-
-	ide_do_request(HWGROUP(drive), IDE_NO_IRQ);
+plug_device:
+	if (!elv_queue_empty(orig_drive->queue))
+		blk_plug_device(orig_drive->queue);
 }
 
 /*
@@ -1074,11 +1065,13 @@ void ide_timer_expiry (unsigned long data)
 			drive->service_time = jiffies - drive->service_start;
 			spin_lock_irq(&hwgroup->lock);
 			enable_irq(hwif->irq);
-			if (startstop == ide_stopped)
+			if (startstop == ide_stopped) {
 				hwgroup->busy = 0;
+				if (!elv_queue_empty(drive->queue))
+					blk_plug_device(drive->queue);
+			}
 		}
 	}
-	ide_do_request(hwgroup, IDE_NO_IRQ);
 	spin_unlock_irqrestore(&hwgroup->lock, flags);
 }
 
@@ -1271,11 +1264,11 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	if (startstop == ide_stopped) {
 		if (hwgroup->handler == NULL) {	/* paranoia */
 			hwgroup->busy = 0;
-			ide_do_request(hwgroup, hwif->irq);
-		} else {
-			printk(KERN_ERR "%s: ide_intr: huh? expected NULL handler "
-				"on exit\n", drive->name);
-		}
+			if (!elv_queue_empty(drive->queue))
+				blk_plug_device(drive->queue);
+		} else
+			printk(KERN_ERR "%s: %s: huh? expected NULL handler "
+					"on exit\n", __func__, drive->name);
 	}
 out_handled:
 	irq_ret = IRQ_HANDLED;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 62fccaea311..968ca8f6053 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -32,13 +32,6 @@
 # define SUPPORT_VLB_SYNC 1
 #endif
 
-/*
- * Used to indicate "no IRQ", should be a value that cannot be an IRQ
- * number.
- */
- 
-#define IDE_NO_IRQ		(-1)
-
 typedef unsigned char	byte;	/* used everywhere */
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 631de3708d595d153e8a510a3608689290f4c0ed Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 2 Jan 2009 16:12:50 +0100
Subject: ide: add ide_[un]lock_hwgroup() helpers

Add ide_[un]lock_hwgroup() inline helpers for obtaining exclusive
access to the given hwgroup and update the core code accordingly.

[ This change besides making code saner results in more efficient
  use of ide_{get,release}_lock(). ]

Cc: Michael Schmitz <schmitz@biophys.uni-duesseldorf.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Elias Oltmanns <eo@nebensachen.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io.c   | 32 +++++++++++---------------------
 drivers/ide/ide-park.c |  2 +-
 include/linux/ide.h    | 20 ++++++++++++++++++++
 3 files changed, 32 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index c60512196f6..ab480042757 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -790,10 +790,7 @@ void do_ide_request(struct request_queue *q)
 	/* caller must own hwgroup->lock */
 	BUG_ON(!irqs_disabled());
 
-	while (!hwgroup->busy) {
-		hwgroup->busy = 1;
-		/* for atari only */
-		ide_get_lock(ide_intr, hwgroup);
+	while (!ide_lock_hwgroup(hwgroup)) {
 		drive = choose_drive(hwgroup);
 		if (drive == NULL) {
 			int sleeping = 0;
@@ -825,17 +822,10 @@ void do_ide_request(struct request_queue *q)
 				hwgroup->sleeping = 1;
 				hwgroup->req_gen_timer = hwgroup->req_gen;
 				mod_timer(&hwgroup->timer, sleep);
-				/* we purposely leave hwgroup->busy==1
+				/* we purposely leave hwgroup locked
 				 * while sleeping */
-			} else {
-				/* Ugly, but how can we sleep for the lock
-				 * otherwise? perhaps from tq_disk?
-				 */
-
-				/* for atari only */
-				ide_release_lock();
-				hwgroup->busy = 0;
-			}
+			} else
+				ide_unlock_hwgroup(hwgroup);
 
 			/* no more work for this hwgroup (for now) */
 			goto plug_device;
@@ -865,7 +855,7 @@ void do_ide_request(struct request_queue *q)
 		 */
 		rq = elv_next_request(drive->queue);
 		if (!rq) {
-			hwgroup->busy = 0;
+			ide_unlock_hwgroup(hwgroup);
 			break;
 		}
 
@@ -885,8 +875,8 @@ void do_ide_request(struct request_queue *q)
 		if ((drive->dev_flags & IDE_DFLAG_BLOCKED) &&
 		    blk_pm_request(rq) == 0 &&
 		    (rq->cmd_flags & REQ_PREEMPT) == 0) {
-			/* We clear busy, there should be no pending ATA command at this point. */
-			hwgroup->busy = 0;
+			/* there should be no pending command at this point */
+			ide_unlock_hwgroup(hwgroup);
 			goto plug_device;
 		}
 
@@ -897,7 +887,7 @@ void do_ide_request(struct request_queue *q)
 		spin_lock_irq(&hwgroup->lock);
 
 		if (startstop == ide_stopped) {
-			hwgroup->busy = 0;
+			ide_unlock_hwgroup(hwgroup);
 			if (!elv_queue_empty(orig_drive->queue))
 				blk_plug_device(orig_drive->queue);
 		}
@@ -1001,7 +991,7 @@ void ide_timer_expiry (unsigned long data)
 		 */
 		if (hwgroup->sleeping) {
 			hwgroup->sleeping = 0;
-			hwgroup->busy = 0;
+			ide_unlock_hwgroup(hwgroup);
 		}
 	} else {
 		ide_drive_t *drive = hwgroup->drive;
@@ -1056,7 +1046,7 @@ void ide_timer_expiry (unsigned long data)
 			spin_lock_irq(&hwgroup->lock);
 			enable_irq(hwif->irq);
 			if (startstop == ide_stopped) {
-				hwgroup->busy = 0;
+				ide_unlock_hwgroup(hwgroup);
 				if (!elv_queue_empty(drive->queue))
 					blk_plug_device(drive->queue);
 			}
@@ -1249,7 +1239,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	drive->service_time = jiffies - drive->service_start;
 	if (startstop == ide_stopped) {
 		if (hwgroup->handler == NULL) {	/* paranoia */
-			hwgroup->busy = 0;
+			ide_unlock_hwgroup(hwgroup);
 			if (!elv_queue_empty(drive->queue))
 				blk_plug_device(drive->queue);
 		} else
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 63d01c55f86..44c6787f8ae 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -22,7 +22,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 		if (reset_timer && hwgroup->sleeping &&
 		    del_timer(&hwgroup->timer)) {
 			hwgroup->sleeping = 0;
-			hwgroup->busy = 0;
+			ide_unlock_hwgroup(hwgroup);
 			blk_start_queueing(q);
 		}
 		spin_unlock_irq(&hwgroup->lock);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 968ca8f6053..f408d6123f1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1280,6 +1280,26 @@ extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout);
 
 extern void ide_timer_expiry(unsigned long);
 extern irqreturn_t ide_intr(int irq, void *dev_id);
+
+static inline int ide_lock_hwgroup(ide_hwgroup_t *hwgroup)
+{
+	if (hwgroup->busy)
+		return 1;
+
+	hwgroup->busy = 1;
+	/* for atari only */
+	ide_get_lock(ide_intr, hwgroup);
+
+	return 0;
+}
+
+static inline void ide_unlock_hwgroup(ide_hwgroup_t *hwgroup)
+{
+	/* for atari only */
+	ide_release_lock();
+	hwgroup->busy = 0;
+}
+
 extern void do_ide_request(struct request_queue *);
 
 void ide_init_disk(struct gendisk *, ide_drive_t *);
-- 
cgit v1.2.3-70-g09d2


From 201bffa46466b4afdf7d29db8eca3fa5decb39c8 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 2 Jan 2009 16:12:50 +0100
Subject: ide: use per-device request queue locks (v2)

* Move hack for flush requests from choose_drive() to do_ide_request().

* Add ide_plug_device() helper and convert core IDE code from using
  per-hwgroup lock as a request lock to use the ->queue_lock instead.

* Remove no longer needed:
  - choose_drive() function
  - WAKEUP() macro
  - 'sleeping' flag from ide_hwif_t
  - 'service_{start,time}' fields from ide_drive_t

This patch results in much simpler and more maintainable code
(besides being a scalability improvement).

v2:
* Fixes/improvements based on review from Elias:
  - take as many requests off the queue as possible
  - remove now redundant BUG_ON()

Cc: Elias Oltmanns <eo@nebensachen.de>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-io.c    | 214 +++++++++++++++---------------------------------
 drivers/ide/ide-park.c  |  13 +--
 drivers/ide/ide-probe.c |   3 +-
 include/linux/ide.h     |   4 -
 4 files changed, 77 insertions(+), 157 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ab480042757..bb3248abf47 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -667,85 +667,10 @@ void ide_stall_queue (ide_drive_t *drive, unsigned long timeout)
 	drive->sleep = timeout + jiffies;
 	drive->dev_flags |= IDE_DFLAG_SLEEPING;
 }
-
 EXPORT_SYMBOL(ide_stall_queue);
 
-#define WAKEUP(drive)	((drive)->service_start + 2 * (drive)->service_time)
-
-/**
- *	choose_drive		-	select a drive to service
- *	@hwgroup: hardware group to select on
- *
- *	choose_drive() selects the next drive which will be serviced.
- *	This is necessary because the IDE layer can't issue commands
- *	to both drives on the same cable, unlike SCSI.
- */
- 
-static inline ide_drive_t *choose_drive (ide_hwgroup_t *hwgroup)
-{
-	ide_drive_t *drive, *best;
-
-repeat:	
-	best = NULL;
-	drive = hwgroup->drive;
-
-	/*
-	 * drive is doing pre-flush, ordered write, post-flush sequence. even
-	 * though that is 3 requests, it must be seen as a single transaction.
-	 * we must not preempt this drive until that is complete
-	 */
-	if (blk_queue_flushing(drive->queue)) {
-		/*
-		 * small race where queue could get replugged during
-		 * the 3-request flush cycle, just yank the plug since
-		 * we want it to finish asap
-		 */
-		blk_remove_plug(drive->queue);
-		return drive;
-	}
-
-	do {
-		u8 dev_s = !!(drive->dev_flags & IDE_DFLAG_SLEEPING);
-		u8 best_s = (best && !!(best->dev_flags & IDE_DFLAG_SLEEPING));
-
-		if ((dev_s == 0 || time_after_eq(jiffies, drive->sleep)) &&
-		    !elv_queue_empty(drive->queue)) {
-			if (best == NULL ||
-			    (dev_s && (best_s == 0 || time_before(drive->sleep, best->sleep))) ||
-			    (best_s == 0 && time_before(WAKEUP(drive), WAKEUP(best)))) {
-				if (!blk_queue_plugged(drive->queue))
-					best = drive;
-			}
-		}
-	} while ((drive = drive->next) != hwgroup->drive);
-
-	if (best && (best->dev_flags & IDE_DFLAG_NICE1) &&
-	    (best->dev_flags & IDE_DFLAG_SLEEPING) == 0 &&
-	    best != hwgroup->drive && best->service_time > WAIT_MIN_SLEEP) {
-		long t = (signed long)(WAKEUP(best) - jiffies);
-		if (t >= WAIT_MIN_SLEEP) {
-		/*
-		 * We *may* have some time to spare, but first let's see if
-		 * someone can potentially benefit from our nice mood today..
-		 */
-			drive = best->next;
-			do {
-				if ((drive->dev_flags & IDE_DFLAG_SLEEPING) == 0
-				 && time_before(jiffies - best->service_time, WAKEUP(drive))
-				 && time_before(WAKEUP(drive), jiffies + t))
-				{
-					ide_stall_queue(best, min_t(long, t, 10 * WAIT_MIN_SLEEP));
-					goto repeat;
-				}
-			} while ((drive = drive->next) != best);
-		}
-	}
-	return best;
-}
-
 /*
  * Issue a new request to a drive from hwgroup
- * Caller must have already done spin_lock_irqsave(&hwgroup->lock, ..);
  *
  * A hwgroup is a serialized group of IDE interfaces.  Usually there is
  * exactly one hwif (interface) per hwgroup, but buggy controllers (eg. CMD640)
@@ -757,8 +682,7 @@ repeat:
  * possibly along with many other devices.  This is especially common in
  * PCI-based systems with off-board IDE controller cards.
  *
- * The IDE driver uses a per-hwgroup spinlock to protect
- * access to the request queues, and to protect the hwgroup->busy flag.
+ * The IDE driver uses a per-hwgroup lock to protect the hwgroup->busy flag.
  *
  * The first thread into the driver for a particular hwgroup sets the
  * hwgroup->busy flag to indicate that this hwgroup is now active,
@@ -780,61 +704,38 @@ repeat:
  */
 void do_ide_request(struct request_queue *q)
 {
-	ide_drive_t	*orig_drive = q->queuedata;
-	ide_hwgroup_t	*hwgroup = orig_drive->hwif->hwgroup;
-	ide_drive_t	*drive;
-	ide_hwif_t	*hwif;
+	ide_drive_t	*drive = q->queuedata;
+	ide_hwif_t	*hwif = drive->hwif;
+	ide_hwgroup_t	*hwgroup = hwif->hwgroup;
 	struct request	*rq;
 	ide_startstop_t	startstop;
 
-	/* caller must own hwgroup->lock */
-	BUG_ON(!irqs_disabled());
-
-	while (!ide_lock_hwgroup(hwgroup)) {
-		drive = choose_drive(hwgroup);
-		if (drive == NULL) {
-			int sleeping = 0;
-			unsigned long sleep = 0; /* shut up, gcc */
-			hwgroup->rq = NULL;
-			drive = hwgroup->drive;
-			do {
-				if ((drive->dev_flags & IDE_DFLAG_SLEEPING) &&
-				    (sleeping == 0 ||
-				     time_before(drive->sleep, sleep))) {
-					sleeping = 1;
-					sleep = drive->sleep;
-				}
-			} while ((drive = drive->next) != hwgroup->drive);
-			if (sleeping) {
+	/*
+	 * drive is doing pre-flush, ordered write, post-flush sequence. even
+	 * though that is 3 requests, it must be seen as a single transaction.
+	 * we must not preempt this drive until that is complete
+	 */
+	if (blk_queue_flushing(q))
 		/*
-		 * Take a short snooze, and then wake up this hwgroup again.
-		 * This gives other hwgroups on the same a chance to
-		 * play fairly with us, just in case there are big differences
-		 * in relative throughputs.. don't want to hog the cpu too much.
+		 * small race where queue could get replugged during
+		 * the 3-request flush cycle, just yank the plug since
+		 * we want it to finish asap
 		 */
-				if (time_before(sleep, jiffies + WAIT_MIN_SLEEP))
-					sleep = jiffies + WAIT_MIN_SLEEP;
-#if 1
-				if (timer_pending(&hwgroup->timer))
-					printk(KERN_CRIT "ide_set_handler: timer already active\n");
-#endif
-				/* so that ide_timer_expiry knows what to do */
-				hwgroup->sleeping = 1;
-				hwgroup->req_gen_timer = hwgroup->req_gen;
-				mod_timer(&hwgroup->timer, sleep);
-				/* we purposely leave hwgroup locked
-				 * while sleeping */
-			} else
-				ide_unlock_hwgroup(hwgroup);
+		blk_remove_plug(q);
 
-			/* no more work for this hwgroup (for now) */
-			goto plug_device;
-		}
+	spin_unlock_irq(q->queue_lock);
+	spin_lock_irq(&hwgroup->lock);
 
-		if (drive != orig_drive)
-			goto plug_device;
+	if (!ide_lock_hwgroup(hwgroup)) {
+repeat:
+		hwgroup->rq = NULL;
 
-		hwif = drive->hwif;
+		if (drive->dev_flags & IDE_DFLAG_SLEEPING) {
+			if (time_before(drive->sleep, jiffies)) {
+				ide_unlock_hwgroup(hwgroup);
+				goto plug_device;
+			}
+		}
 
 		if (hwif != hwgroup->hwif) {
 			/*
@@ -847,16 +748,20 @@ void do_ide_request(struct request_queue *q)
 		hwgroup->hwif = hwif;
 		hwgroup->drive = drive;
 		drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED);
-		drive->service_start = jiffies;
 
+		spin_unlock_irq(&hwgroup->lock);
+		spin_lock_irq(q->queue_lock);
 		/*
 		 * we know that the queue isn't empty, but this can happen
 		 * if the q->prep_rq_fn() decides to kill a request
 		 */
 		rq = elv_next_request(drive->queue);
+		spin_unlock_irq(q->queue_lock);
+		spin_lock_irq(&hwgroup->lock);
+
 		if (!rq) {
 			ide_unlock_hwgroup(hwgroup);
-			break;
+			goto out;
 		}
 
 		/*
@@ -886,17 +791,21 @@ void do_ide_request(struct request_queue *q)
 		startstop = start_request(drive, rq);
 		spin_lock_irq(&hwgroup->lock);
 
-		if (startstop == ide_stopped) {
-			ide_unlock_hwgroup(hwgroup);
-			if (!elv_queue_empty(orig_drive->queue))
-				blk_plug_device(orig_drive->queue);
-		}
-	}
+		if (startstop == ide_stopped)
+			goto repeat;
+	} else
+		goto plug_device;
+out:
+	spin_unlock_irq(&hwgroup->lock);
+	spin_lock_irq(q->queue_lock);
 	return;
 
 plug_device:
-	if (!elv_queue_empty(orig_drive->queue))
-		blk_plug_device(orig_drive->queue);
+	spin_unlock_irq(&hwgroup->lock);
+	spin_lock_irq(q->queue_lock);
+
+	if (!elv_queue_empty(q))
+		blk_plug_device(q);
 }
 
 /*
@@ -957,6 +866,17 @@ out:
 	return ret;
 }
 
+static void ide_plug_device(ide_drive_t *drive)
+{
+	struct request_queue *q = drive->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!elv_queue_empty(q))
+		blk_plug_device(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
 /**
  *	ide_timer_expiry	-	handle lack of an IDE interrupt
  *	@data: timer callback magic (hwgroup)
@@ -974,10 +894,12 @@ out:
 void ide_timer_expiry (unsigned long data)
 {
 	ide_hwgroup_t	*hwgroup = (ide_hwgroup_t *) data;
+	ide_drive_t	*uninitialized_var(drive);
 	ide_handler_t	*handler;
 	ide_expiry_t	*expiry;
 	unsigned long	flags;
 	unsigned long	wait = -1;
+	int		plug_device = 0;
 
 	spin_lock_irqsave(&hwgroup->lock, flags);
 
@@ -989,12 +911,8 @@ void ide_timer_expiry (unsigned long data)
 		 * or we were "sleeping" to give other devices a chance.
 		 * Either way, we don't really want to complain about anything.
 		 */
-		if (hwgroup->sleeping) {
-			hwgroup->sleeping = 0;
-			ide_unlock_hwgroup(hwgroup);
-		}
 	} else {
-		ide_drive_t *drive = hwgroup->drive;
+		drive = hwgroup->drive;
 		if (!drive) {
 			printk(KERN_ERR "ide_timer_expiry: hwgroup->drive was NULL\n");
 			hwgroup->handler = NULL;
@@ -1042,17 +960,18 @@ void ide_timer_expiry (unsigned long data)
 					ide_error(drive, "irq timeout",
 						  hwif->tp_ops->read_status(hwif));
 			}
-			drive->service_time = jiffies - drive->service_start;
 			spin_lock_irq(&hwgroup->lock);
 			enable_irq(hwif->irq);
 			if (startstop == ide_stopped) {
 				ide_unlock_hwgroup(hwgroup);
-				if (!elv_queue_empty(drive->queue))
-					blk_plug_device(drive->queue);
+				plug_device = 1;
 			}
 		}
 	}
 	spin_unlock_irqrestore(&hwgroup->lock, flags);
+
+	if (plug_device)
+		ide_plug_device(drive);
 }
 
 /**
@@ -1146,10 +1065,11 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	unsigned long flags;
 	ide_hwgroup_t *hwgroup = (ide_hwgroup_t *)dev_id;
 	ide_hwif_t *hwif = hwgroup->hwif;
-	ide_drive_t *drive;
+	ide_drive_t *uninitialized_var(drive);
 	ide_handler_t *handler;
 	ide_startstop_t startstop;
 	irqreturn_t irq_ret = IRQ_NONE;
+	int plug_device = 0;
 
 	spin_lock_irqsave(&hwgroup->lock, flags);
 
@@ -1236,12 +1156,10 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	 * same irq as is currently being serviced here, and Linux
 	 * won't allow another of the same (on any CPU) until we return.
 	 */
-	drive->service_time = jiffies - drive->service_start;
 	if (startstop == ide_stopped) {
 		if (hwgroup->handler == NULL) {	/* paranoia */
 			ide_unlock_hwgroup(hwgroup);
-			if (!elv_queue_empty(drive->queue))
-				blk_plug_device(drive->queue);
+			plug_device = 1;
 		} else
 			printk(KERN_ERR "%s: %s: huh? expected NULL handler "
 					"on exit\n", __func__, drive->name);
@@ -1250,6 +1168,10 @@ out_handled:
 	irq_ret = IRQ_HANDLED;
 out:
 	spin_unlock_irqrestore(&hwgroup->lock, flags);
+
+	if (plug_device)
+		ide_plug_device(drive);
+
 	return irq_ret;
 }
 
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 44c6787f8ae..678454ac248 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -16,16 +16,19 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	spin_lock_irq(&hwgroup->lock);
 	if (drive->dev_flags & IDE_DFLAG_PARKED) {
 		int reset_timer = time_before(timeout, drive->sleep);
+		int start_queue = 0;
 
 		drive->sleep = timeout;
 		wake_up_all(&ide_park_wq);
-		if (reset_timer && hwgroup->sleeping &&
-		    del_timer(&hwgroup->timer)) {
-			hwgroup->sleeping = 0;
-			ide_unlock_hwgroup(hwgroup);
+		if (reset_timer && del_timer(&hwgroup->timer))
+			start_queue = 1;
+		spin_unlock_irq(&hwgroup->lock);
+
+		if (start_queue) {
+			spin_lock_irq(q->queue_lock);
 			blk_start_queueing(q);
+			spin_unlock_irq(q->queue_lock);
 		}
-		spin_unlock_irq(&hwgroup->lock);
 		return;
 	}
 	spin_unlock_irq(&hwgroup->lock);
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index f9efd069edc..966b74c1577 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -881,8 +881,7 @@ static int ide_init_queue(ide_drive_t *drive)
 	 *	do not.
 	 */
 
-	q = blk_init_queue_node(do_ide_request, &hwif->hwgroup->lock,
-				hwif_to_node(hwif));
+	q = blk_init_queue_node(do_ide_request, NULL, hwif_to_node(hwif));
 	if (!q)
 		return 1;
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index f408d6123f1..5f86ad40ee7 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -603,8 +603,6 @@ struct ide_drive_s {
 	unsigned long dev_flags;
 
 	unsigned long sleep;		/* sleep until this time */
-	unsigned long service_start;	/* time we started last request */
-	unsigned long service_time;	/* service time of last request */
 	unsigned long timeout;		/* max time to wait for irq */
 
 	special_t	special;	/* special action flags */
@@ -872,8 +870,6 @@ typedef struct hwgroup_s {
 
 		/* BOOL: protects all fields below */
 	volatile int busy;
-		/* BOOL: wake us up on timer expiry */
-	unsigned int sleeping	: 1;
 		/* BOOL: polling active & poll_timeout field valid */
 	unsigned int polling	: 1;
 
-- 
cgit v1.2.3-70-g09d2


From bf64741fe89280bd81a9e3a1beadec1570861848 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Fri, 2 Jan 2009 16:12:50 +0100
Subject: ide: make IDE_AFLAG_.. numbering continuous again

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 include/linux/ide.h | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 5f86ad40ee7..eb4c01f7f25 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -473,53 +473,53 @@ enum {
 
 	/* ide-cd */
 	/* Drive cannot eject the disc. */
-	IDE_AFLAG_NO_EJECT		= (1 << 3),
+	IDE_AFLAG_NO_EJECT		= (1 << 1),
 	/* Drive is a pre ATAPI 1.2 drive. */
-	IDE_AFLAG_PRE_ATAPI12		= (1 << 4),
+	IDE_AFLAG_PRE_ATAPI12		= (1 << 2),
 	/* TOC addresses are in BCD. */
-	IDE_AFLAG_TOCADDR_AS_BCD	= (1 << 5),
+	IDE_AFLAG_TOCADDR_AS_BCD	= (1 << 3),
 	/* TOC track numbers are in BCD. */
-	IDE_AFLAG_TOCTRACKS_AS_BCD	= (1 << 6),
+	IDE_AFLAG_TOCTRACKS_AS_BCD	= (1 << 4),
 	/*
 	 * Drive does not provide data in multiples of SECTOR_SIZE
 	 * when more than one interrupt is needed.
 	 */
-	IDE_AFLAG_LIMIT_NFRAMES		= (1 << 7),
+	IDE_AFLAG_LIMIT_NFRAMES		= (1 << 5),
 	/* Saved TOC information is current. */
-	IDE_AFLAG_TOC_VALID		= (1 << 9),
+	IDE_AFLAG_TOC_VALID		= (1 << 6),
 	/* We think that the drive door is locked. */
-	IDE_AFLAG_DOOR_LOCKED		= (1 << 10),
+	IDE_AFLAG_DOOR_LOCKED		= (1 << 7),
 	/* SET_CD_SPEED command is unsupported. */
-	IDE_AFLAG_NO_SPEED_SELECT	= (1 << 11),
-	IDE_AFLAG_VERTOS_300_SSD	= (1 << 12),
-	IDE_AFLAG_VERTOS_600_ESD	= (1 << 13),
-	IDE_AFLAG_SANYO_3CD		= (1 << 14),
-	IDE_AFLAG_FULL_CAPS_PAGE	= (1 << 15),
-	IDE_AFLAG_PLAY_AUDIO_OK		= (1 << 16),
-	IDE_AFLAG_LE_SPEED_FIELDS	= (1 << 17),
+	IDE_AFLAG_NO_SPEED_SELECT	= (1 << 8),
+	IDE_AFLAG_VERTOS_300_SSD	= (1 << 9),
+	IDE_AFLAG_VERTOS_600_ESD	= (1 << 10),
+	IDE_AFLAG_SANYO_3CD		= (1 << 11),
+	IDE_AFLAG_FULL_CAPS_PAGE	= (1 << 12),
+	IDE_AFLAG_PLAY_AUDIO_OK		= (1 << 13),
+	IDE_AFLAG_LE_SPEED_FIELDS	= (1 << 14),
 
 	/* ide-floppy */
 	/* Avoid commands not supported in Clik drive */
-	IDE_AFLAG_CLIK_DRIVE		= (1 << 19),
+	IDE_AFLAG_CLIK_DRIVE		= (1 << 15),
 	/* Requires BH algorithm for packets */
-	IDE_AFLAG_ZIP_DRIVE		= (1 << 20),
+	IDE_AFLAG_ZIP_DRIVE		= (1 << 16),
 	/* Supports format progress report */
-	IDE_AFLAG_SRFP			= (1 << 22),
+	IDE_AFLAG_SRFP			= (1 << 17),
 
 	/* ide-tape */
-	IDE_AFLAG_IGNORE_DSC		= (1 << 23),
+	IDE_AFLAG_IGNORE_DSC		= (1 << 18),
 	/* 0 When the tape position is unknown */
-	IDE_AFLAG_ADDRESS_VALID		= (1 <<	24),
+	IDE_AFLAG_ADDRESS_VALID		= (1 <<	19),
 	/* Device already opened */
-	IDE_AFLAG_BUSY			= (1 << 25),
+	IDE_AFLAG_BUSY			= (1 << 20),
 	/* Attempt to auto-detect the current user block size */
-	IDE_AFLAG_DETECT_BS		= (1 << 26),
+	IDE_AFLAG_DETECT_BS		= (1 << 21),
 	/* Currently on a filemark */
-	IDE_AFLAG_FILEMARK		= (1 << 27),
+	IDE_AFLAG_FILEMARK		= (1 << 22),
 	/* 0 = no tape is loaded, so we don't rewind after ejecting */
-	IDE_AFLAG_MEDIUM_PRESENT	= (1 << 28),
+	IDE_AFLAG_MEDIUM_PRESENT	= (1 << 23),
 
-	IDE_AFLAG_NO_AUTOCLOSE		= (1 << 29),
+	IDE_AFLAG_NO_AUTOCLOSE		= (1 << 24),
 };
 
 /* device flags */
-- 
cgit v1.2.3-70-g09d2


From 392de1d53dd40e2eebee3a0a26aa647a3865ca78 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Fri, 2 Jan 2009 16:12:52 +0100
Subject: ide-atapi: accomodate transfer length calculation for ide-cd

... by factoring it out of ide_cd_do_request() into a helper, as suggested by
Bart.

There should be no functionality change resulting from this patch.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
[bart: BLK_DEV_IDECD needs to select IDE_ATAPI now]
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/Kconfig     |  1 +
 drivers/ide/ide-atapi.c | 15 ++++++++++++++-
 drivers/ide/ide-cd.c    |  4 ++--
 include/linux/ide.h     |  2 ++
 4 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 937945e471d..4ee85fcf9aa 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -137,6 +137,7 @@ config BLK_DEV_DELKIN
 
 config BLK_DEV_IDECD
 	tristate "Include IDE/ATAPI CDROM support"
+	select IDE_ATAPI
 	---help---
 	  If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is
 	  a newer protocol used by IDE CD-ROM and TAPE drives, similar to the
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 74273fdc882..8884877bd2b 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -252,6 +252,18 @@ int ide_scsi_expiry(ide_drive_t *drive)
 }
 EXPORT_SYMBOL_GPL(ide_scsi_expiry);
 
+int ide_cd_get_xferlen(struct request *rq)
+{
+	if (blk_fs_request(rq))
+		return 32768;
+	else if (blk_sense_request(rq) || blk_pc_request(rq) ||
+			 rq->cmd_type == REQ_TYPE_ATA_PC)
+		return rq->data_len;
+	else
+		return 0;
+}
+EXPORT_SYMBOL_GPL(ide_cd_get_xferlen);
+
 /*
  * This is the usual interrupt handler which will be called during a packet
  * command.  We will transfer some of the data (as requested by the drive)
@@ -551,7 +563,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout,
 	struct ide_atapi_pc *pc = drive->pc;
 	ide_hwif_t *hwif = drive->hwif;
 	u32 tf_flags;
-	u16 bcount = 0;
+	u16 bcount;
 	u8 scsi = !!(drive->dev_flags & IDE_DFLAG_SCSI);
 
 	/* We haven't transferred any data yet */
@@ -560,6 +572,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout,
 
 	if (dev_is_idecd(drive)) {
 		tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL;
+		bcount = ide_cd_get_xferlen(hwif->hwgroup->rq);
 	} else if (scsi) {
 		tf_flags = 0;
 		bcount = min(pc->req_xfer, 63 * 1024);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 65e5513758b..8d3c7714682 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1214,8 +1214,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		      __func__, rq->cmd[0], rq->cmd_type,
 		      (unsigned long long)block);
 
+	xferlen = ide_cd_get_xferlen(rq);
+
 	if (blk_fs_request(rq)) {
-		xferlen = 32768;
 		fn = cdrom_start_rw_cont;
 
 		if (cdrom_start_rw(drive, rq) == ide_stopped)
@@ -1225,7 +1226,6 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 			return ide_stopped;
 	} else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 		   rq->cmd_type == REQ_TYPE_ATA_PC) {
-		xferlen = rq->data_len;
 		fn = cdrom_do_newpc_cont;
 
 		if (!rq->timeout)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index eb4c01f7f25..e35ff682789 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1254,6 +1254,8 @@ static inline unsigned long ide_scsi_get_timeout(struct ide_atapi_pc *pc)
 
 int ide_scsi_expiry(ide_drive_t *);
 
+int ide_cd_get_xferlen(struct request *);
+
 ide_startstop_t ide_issue_pc(ide_drive_t *, unsigned int, ide_expiry_t *);
 
 ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *);
-- 
cgit v1.2.3-70-g09d2


From 4cad085efbce8dcc5006b0d1034089758b4fc7ba Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@gmail.com>
Date: Fri, 2 Jan 2009 16:12:53 +0100
Subject: ide-cd: move cdrom_timer_expiry to ide-atapi.c

- cdrom_timer_expiry -> ide_cd_expiry
- remove expiry-arg to ide_issue_pc as it is redundant now
- ide_debug_log -> debug_log

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c  | 41 ++++++++++++++++++++++++++++++++++++++---
 drivers/ide/ide-cd.c     | 38 +++-----------------------------------
 drivers/ide/ide-cd.h     |  4 ----
 drivers/ide/ide-floppy.c |  2 +-
 drivers/ide/ide-tape.c   |  2 +-
 include/linux/ide.h      |  4 +++-
 6 files changed, 46 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 8c5cf68fbd7..c110329ccb1 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/cdrom.h>
 #include <linux/delay.h>
 #include <linux/ide.h>
 #include <scsi/scsi.h>
@@ -252,6 +253,38 @@ int ide_scsi_expiry(ide_drive_t *drive)
 }
 EXPORT_SYMBOL_GPL(ide_scsi_expiry);
 
+int ide_cd_expiry(ide_drive_t *drive)
+{
+	struct request *rq = HWGROUP(drive)->rq;
+	unsigned long wait = 0;
+
+	debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]);
+
+	/*
+	 * Some commands are *slow* and normally take a long time to complete.
+	 * Usually we can use the ATAPI "disconnect" to bypass this, but not all
+	 * commands/drives support that. Let ide_timer_expiry keep polling us
+	 * for these.
+	 */
+	switch (rq->cmd[0]) {
+	case GPCMD_BLANK:
+	case GPCMD_FORMAT_UNIT:
+	case GPCMD_RESERVE_RZONE_TRACK:
+	case GPCMD_CLOSE_TRACK:
+	case GPCMD_FLUSH_CACHE:
+		wait = ATAPI_WAIT_PC;
+		break;
+	default:
+		if (!(rq->cmd_flags & REQ_QUIET))
+			printk(KERN_INFO "cmd 0x%x timed out\n",
+					 rq->cmd[0]);
+		wait = 0;
+		break;
+	}
+	return wait;
+}
+EXPORT_SYMBOL_GPL(ide_cd_expiry);
+
 int ide_cd_get_xferlen(struct request *rq)
 {
 	if (blk_fs_request(rq))
@@ -562,11 +595,11 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 	return ide_started;
 }
 
-ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout,
-			     ide_expiry_t *expiry)
+ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout)
 {
 	struct ide_atapi_pc *pc = drive->pc;
 	ide_hwif_t *hwif = drive->hwif;
+	ide_expiry_t *expiry = NULL;
 	u32 tf_flags;
 	u16 bcount;
 	u8 scsi = !!(drive->dev_flags & IDE_DFLAG_SCSI);
@@ -578,9 +611,11 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout,
 	if (dev_is_idecd(drive)) {
 		tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL;
 		bcount = ide_cd_get_xferlen(hwif->hwgroup->rq);
+		expiry = ide_cd_expiry;
 	} else if (scsi) {
 		tf_flags = 0;
 		bcount = min(pc->req_xfer, 63 * 1024);
+		expiry = ide_scsi_expiry;
 	} else {
 		tf_flags = IDE_TFLAG_OUT_DEVICE;
 		bcount = ((drive->media == ide_tape) ?
@@ -613,7 +648,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout,
 		if (drive->dma)
 			drive->waiting_for_dma = 0;
 		ide_execute_command(drive, ATA_CMD_PACKET, ide_transfer_pc,
-				    timeout, NULL);
+				    timeout, expiry);
 		return ide_started;
 	} else {
 		ide_execute_pkt_cmd(drive);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 8d3c7714682..105e4d855e6 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -511,38 +511,6 @@ end_request:
 	return 1;
 }
 
-static int cdrom_timer_expiry(ide_drive_t *drive)
-{
-	struct request *rq = HWGROUP(drive)->rq;
-	unsigned long wait = 0;
-
-	ide_debug_log(IDE_DBG_RQ, "Call %s: rq->cmd[0]: 0x%x\n", __func__,
-		      rq->cmd[0]);
-
-	/*
-	 * Some commands are *slow* and normally take a long time to complete.
-	 * Usually we can use the ATAPI "disconnect" to bypass this, but not all
-	 * commands/drives support that. Let ide_timer_expiry keep polling us
-	 * for these.
-	 */
-	switch (rq->cmd[0]) {
-	case GPCMD_BLANK:
-	case GPCMD_FORMAT_UNIT:
-	case GPCMD_RESERVE_RZONE_TRACK:
-	case GPCMD_CLOSE_TRACK:
-	case GPCMD_FLUSH_CACHE:
-		wait = ATAPI_WAIT_PC;
-		break;
-	default:
-		if (!(rq->cmd_flags & REQ_QUIET))
-			printk(KERN_INFO PFX "cmd 0x%x timed out\n",
-					 rq->cmd[0]);
-		wait = 0;
-		break;
-	}
-	return wait;
-}
-
 /*
  * Set up the device registers for transferring a packet command on DEV,
  * expecting to later transfer XFERLEN bytes.  HANDLER is the routine
@@ -574,7 +542,7 @@ static ide_startstop_t cdrom_start_packet_command(ide_drive_t *drive,
 
 		/* packet command */
 		ide_execute_command(drive, ATA_CMD_PACKET, handler,
-				    ATAPI_WAIT_PC, cdrom_timer_expiry);
+				    ATAPI_WAIT_PC, ide_cd_expiry);
 		return ide_started;
 	} else {
 		ide_execute_pkt_cmd(drive);
@@ -621,7 +589,7 @@ static ide_startstop_t cdrom_transfer_packet_command(ide_drive_t *drive,
 	}
 
 	/* arm the interrupt handler */
-	ide_set_handler(drive, handler, rq->timeout, cdrom_timer_expiry);
+	ide_set_handler(drive, handler, rq->timeout, ide_cd_expiry);
 
 	/* ATAPI commands get padded out to 12 bytes minimum */
 	cmd_len = COMMAND_SIZE(rq->cmd[0]);
@@ -1088,7 +1056,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	} else {
 		timeout = ATAPI_WAIT_PC;
 		if (!blk_fs_request(rq))
-			expiry = cdrom_timer_expiry;
+			expiry = ide_cd_expiry;
 	}
 
 	ide_set_handler(drive, cdrom_newpc_intr, timeout, expiry);
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 389faa42eaa..bf676b26218 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -16,10 +16,6 @@
 #define ide_debug_log(lvl, fmt, args...) do {} while (0)
 #endif
 
-/*
- * typical timeout for packet command
- */
-#define ATAPI_WAIT_PC		(60 * HZ)
 #define ATAPI_WAIT_WRITE_BUSY	(10 * HZ)
 
 /************************************************************************/
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 1f07f381893..fdec729d0e4 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -197,7 +197,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive,
 
 	pc->retries++;
 
-	return ide_issue_pc(drive, WAIT_FLOPPY_CMD, NULL);
+	return ide_issue_pc(drive, WAIT_FLOPPY_CMD);
 }
 
 void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *pc)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index a2d470eb2b5..ac9e29a4991 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -694,7 +694,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive,
 
 	pc->retries++;
 
-	return ide_issue_pc(drive, WAIT_TAPE_CMD, NULL);
+	return ide_issue_pc(drive, WAIT_TAPE_CMD);
 }
 
 /* A mode sense command is used to "sense" tape parameters. */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index e35ff682789..e20e0b5c173 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -396,6 +396,7 @@ enum {
  * This is used for several packet commands (not for READ/WRITE commands).
  */
 #define IDE_PC_BUFFER_SIZE	256
+#define ATAPI_WAIT_PC		(60 * HZ)
 
 struct ide_atapi_pc {
 	/* actual packet bytes */
@@ -1253,10 +1254,11 @@ static inline unsigned long ide_scsi_get_timeout(struct ide_atapi_pc *pc)
 }
 
 int ide_scsi_expiry(ide_drive_t *);
+int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
 
-ide_startstop_t ide_issue_pc(ide_drive_t *, unsigned int, ide_expiry_t *);
+ide_startstop_t ide_issue_pc(ide_drive_t *, unsigned int);
 
 ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *);
 
-- 
cgit v1.2.3-70-g09d2


From 5d655a03b847fbe5353a8a74bbeb75e18708dca3 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Fri, 2 Jan 2009 16:12:54 +0100
Subject: ide-atapi: remove ide-scsi remnants from ide_pc_intr()

As a result, remove now unused ide_scsi_get_timeout and ide_scsi_expiry.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c | 68 ++++++++++---------------------------------------
 include/linux/ide.h     |  6 -----
 2 files changed, 13 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index f5bf405c36a..7a04509bf96 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -240,19 +240,6 @@ void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk)
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
-int ide_scsi_expiry(ide_drive_t *drive)
-{
-	struct ide_atapi_pc *pc = drive->pc;
-
-	debug_log("%s called for %lu at %lu\n", __func__,
-		  pc->scsi_cmd->serial_number, jiffies);
-
-	pc->flags |= PC_FLAG_TIMEDOUT;
-
-	return 0; /* we do not want the IDE subsystem to retry */
-}
-EXPORT_SYMBOL_GPL(ide_scsi_expiry);
-
 int ide_cd_expiry(ide_drive_t *drive)
 {
 	struct request *rq = HWGROUP(drive)->rq;
@@ -309,21 +296,14 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	struct request *rq = hwif->hwgroup->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 	xfer_func_t *xferfunc;
-	ide_expiry_t *expiry;
 	unsigned int timeout, temp;
 	u16 bcount;
-	u8 stat, ireason, scsi = !!(drive->dev_flags & IDE_DFLAG_SCSI), dsc = 0;
+	u8 stat, ireason, dsc = 0;
 
 	debug_log("Enter %s - interrupt handler\n", __func__);
 
-	if (scsi) {
-		timeout = ide_scsi_get_timeout(pc);
-		expiry = ide_scsi_expiry;
-	} else {
-		timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD
-						       : WAIT_TAPE_CMD;
-		expiry = NULL;
-	}
+	timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD
+					       : WAIT_TAPE_CMD;
 
 	if (pc->flags & PC_FLAG_TIMEDOUT) {
 		drive->pc_callback(drive, 0);
@@ -335,8 +315,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 	if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) {
 		if (hwif->dma_ops->dma_end(drive) ||
-		    (drive->media == ide_tape && !scsi && (stat & ATA_ERR))) {
-			if (drive->media == ide_floppy && !scsi)
+		    (drive->media == ide_tape && (stat & ATA_ERR))) {
+			if (drive->media == ide_floppy)
 				printk(KERN_ERR "%s: DMA %s error\n",
 					drive->name, rq_data_dir(pc->rq)
 						     ? "write" : "read");
@@ -358,7 +338,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		local_irq_enable_in_hardirq();
 
-		if (drive->media == ide_tape && !scsi &&
+		if (drive->media == ide_tape &&
 		    (stat & ATA_ERR) && rq->cmd[0] == REQUEST_SENSE)
 			stat &= ~ATA_ERR;
 
@@ -366,11 +346,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			/* Error detected */
 			debug_log("%s: I/O error\n", drive->name);
 
-			if (drive->media != ide_tape || scsi) {
+			if (drive->media != ide_tape)
 				pc->rq->errors++;
-				if (scsi)
-					goto cmd_finished;
-			}
 
 			if (rq->cmd[0] == REQUEST_SENSE) {
 				printk(KERN_ERR "%s: I/O error in request sense"
@@ -386,7 +363,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			/* queued, but not started */
 			return ide_stopped;
 		}
-cmd_finished:
 		pc->error = 0;
 
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
@@ -433,25 +409,8 @@ cmd_finished:
 						"us more data than expected - "
 						"discarding data\n",
 						drive->name);
-				if (scsi)
-					temp = pc->buf_size - pc->xferred;
-				else
-					temp = 0;
-				if (temp) {
-					if (pc->sg)
-						drive->pc_io_buffers(drive, pc,
-								     temp, 0);
-					else
-						tp_ops->input_data(drive, NULL,
-							pc->cur_pos, temp);
-					printk(KERN_ERR "%s: transferred %d of "
-							"%d bytes\n",
-							drive->name,
-							temp, bcount);
-				}
-				pc->xferred += temp;
-				pc->cur_pos += temp;
-				ide_pad_transfer(drive, 0, bcount - temp);
+
+				ide_pad_transfer(drive, 0, bcount);
 				goto next_irq;
 			}
 			debug_log("The device wants to send us more data than "
@@ -461,14 +420,13 @@ cmd_finished:
 	} else
 		xferfunc = tp_ops->output_data;
 
-	if ((drive->media == ide_floppy && !scsi && !pc->buf) ||
-	    (drive->media == ide_tape && !scsi && pc->bh) ||
-	    (scsi && pc->sg)) {
+	if ((drive->media == ide_floppy && !pc->buf) ||
+	    (drive->media == ide_tape && pc->bh)) {
 		int done = drive->pc_io_buffers(drive, pc, bcount,
 				  !!(pc->flags & PC_FLAG_WRITING));
 
 		/* FIXME: don't do partial completions */
-		if (drive->media == ide_floppy && !scsi)
+		if (drive->media == ide_floppy)
 			ide_end_request(drive, 1, done >> 9);
 	} else
 		xferfunc(drive, NULL, pc->cur_pos, bcount);
@@ -481,7 +439,7 @@ cmd_finished:
 		  rq->cmd[0], bcount);
 next_irq:
 	/* And set the interrupt handler again */
-	ide_set_handler(drive, ide_pc_intr, timeout, expiry);
+	ide_set_handler(drive, ide_pc_intr, timeout, NULL);
 	return ide_started;
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index e20e0b5c173..257524ee1af 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1248,12 +1248,6 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *, struct gendisk *);
 
-static inline unsigned long ide_scsi_get_timeout(struct ide_atapi_pc *pc)
-{
-	return max_t(unsigned long, WAIT_CMD, pc->timeout - jiffies);
-}
-
-int ide_scsi_expiry(ide_drive_t *);
 int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
-- 
cgit v1.2.3-70-g09d2


From 5317464dccd0c03026d60f1e9968de4f9cd23f69 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Fri, 2 Jan 2009 16:12:54 +0100
Subject: ide: remove the last ide-scsi remnants

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c  |  3 +--
 drivers/ide/ide-io.c     |  3 ---
 drivers/ide/ide-ioctls.c |  3 +--
 drivers/ide/ide-probe.c  |  2 --
 include/linux/ide.h      | 28 +++++++++++++---------------
 5 files changed, 15 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 7a04509bf96..d412bd2bd7f 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -17,8 +17,7 @@
 
 static inline int dev_is_idecd(ide_drive_t *drive)
 {
-	return (drive->media == ide_cdrom || drive->media == ide_optical) &&
-		!(drive->dev_flags & IDE_DFLAG_SCSI);
+	return drive->media == ide_cdrom || drive->media == ide_optical;
 }
 
 /*
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index bb3248abf47..1c36a8e83d3 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -426,9 +426,6 @@ void ide_map_sg(ide_drive_t *drive, struct request *rq)
 	ide_hwif_t *hwif = drive->hwif;
 	struct scatterlist *sg = hwif->sg_table;
 
-	if (hwif->sg_mapped)	/* needed by ide-scsi */
-		return;
-
 	if (rq->cmd_type != REQ_TYPE_ATA_TASKFILE) {
 		hwif->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
 	} else {
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 28232c64c34..1be263eb9c0 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -95,8 +95,7 @@ static int ide_set_nice_ioctl(ide_drive_t *drive, unsigned long arg)
 		return -EPERM;
 
 	if (((arg >> IDE_NICE_DSC_OVERLAP) & 1) &&
-	    (drive->media != ide_tape ||
-	     (drive->dev_flags & IDE_DFLAG_SCSI)))
+	    (drive->media != ide_tape))
 		return -EPERM;
 
 	if ((arg >> IDE_NICE_DSC_OVERLAP) & 1)
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 966b74c1577..c5adb7b9c5b 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1141,8 +1141,6 @@ static struct kobject *ata_probe(dev_t dev, int *part, void *data)
 
 	if (drive->media == ide_disk)
 		request_module("ide-disk");
-	if (drive->dev_flags & IDE_DFLAG_SCSI)
-		request_module("ide-scsi");
 	if (drive->media == ide_cdrom || drive->media == ide_optical)
 		request_module("ide-cd");
 	if (drive->media == ide_tape)
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 257524ee1af..ad57a449294 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -559,28 +559,26 @@ enum {
 	IDE_DFLAG_NODMA			= (1 << 16),
 	/* powermanagment told us not to do anything, so sleep nicely */
 	IDE_DFLAG_BLOCKED		= (1 << 17),
-	/* ide-scsi emulation */
-	IDE_DFLAG_SCSI			= (1 << 18),
 	/* sleeping & sleep field valid */
-	IDE_DFLAG_SLEEPING		= (1 << 19),
-	IDE_DFLAG_POST_RESET		= (1 << 20),
-	IDE_DFLAG_UDMA33_WARNED		= (1 << 21),
-	IDE_DFLAG_LBA48			= (1 << 22),
+	IDE_DFLAG_SLEEPING		= (1 << 18),
+	IDE_DFLAG_POST_RESET		= (1 << 19),
+	IDE_DFLAG_UDMA33_WARNED		= (1 << 20),
+	IDE_DFLAG_LBA48			= (1 << 21),
 	/* status of write cache */
-	IDE_DFLAG_WCACHE		= (1 << 23),
+	IDE_DFLAG_WCACHE		= (1 << 22),
 	/* used for ignoring ATA_DF */
-	IDE_DFLAG_NOWERR		= (1 << 24),
+	IDE_DFLAG_NOWERR		= (1 << 23),
 	/* retrying in PIO */
-	IDE_DFLAG_DMA_PIO_RETRY		= (1 << 25),
-	IDE_DFLAG_LBA			= (1 << 26),
+	IDE_DFLAG_DMA_PIO_RETRY		= (1 << 24),
+	IDE_DFLAG_LBA			= (1 << 25),
 	/* don't unload heads */
-	IDE_DFLAG_NO_UNLOAD		= (1 << 27),
+	IDE_DFLAG_NO_UNLOAD		= (1 << 26),
 	/* heads unloaded, please don't reset port */
-	IDE_DFLAG_PARKED		= (1 << 28),
-	IDE_DFLAG_MEDIA_CHANGED		= (1 << 29),
+	IDE_DFLAG_PARKED		= (1 << 27),
+	IDE_DFLAG_MEDIA_CHANGED		= (1 << 28),
 	/* write protect */
-	IDE_DFLAG_WP			= (1 << 30),
-	IDE_DFLAG_FORMAT_IN_PROGRESS	= (1 << 31),
+	IDE_DFLAG_WP			= (1 << 29),
+	IDE_DFLAG_FORMAT_IN_PROGRESS	= (1 << 30),
 };
 
 struct ide_drive_s {
-- 
cgit v1.2.3-70-g09d2


From 28ad91db77755f1c49d79652de11b28ee2cfbf03 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Fri, 2 Jan 2009 16:12:56 +0100
Subject: ide-atapi: remove timeout arg to ide_issue_pc

There should be no functionality change resulting from this patch.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-atapi.c  | 7 ++++++-
 drivers/ide/ide-floppy.c | 2 +-
 drivers/ide/ide-tape.c   | 2 +-
 include/linux/ide.h      | 2 +-
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index fa7a70a3f24..c470dbb155c 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -562,11 +562,12 @@ static ide_startstop_t ide_transfer_pc(ide_drive_t *drive)
 	return ide_started;
 }
 
-ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout)
+ide_startstop_t ide_issue_pc(ide_drive_t *drive)
 {
 	struct ide_atapi_pc *pc;
 	ide_hwif_t *hwif = drive->hwif;
 	ide_expiry_t *expiry = NULL;
+	unsigned int timeout;
 	u32 tf_flags;
 	u16 bcount;
 
@@ -574,6 +575,7 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout)
 		tf_flags = IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL;
 		bcount = ide_cd_get_xferlen(hwif->hwgroup->rq);
 		expiry = ide_cd_expiry;
+		timeout = ATAPI_WAIT_PC;
 
 		if (drive->dma)
 			drive->dma = !hwif->dma_ops->dma_setup(drive);
@@ -600,6 +602,9 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, unsigned int timeout)
 
 		if (!drive->dma)
 			pc->flags &= ~PC_FLAG_DMA_OK;
+
+		timeout = (drive->media == ide_floppy) ? WAIT_FLOPPY_CMD
+						       : WAIT_TAPE_CMD;
 	}
 
 	ide_pktcmd_tf_load(drive, tf_flags, bcount, drive->dma);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index fdec729d0e4..0a48e2dc53a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -197,7 +197,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive,
 
 	pc->retries++;
 
-	return ide_issue_pc(drive, WAIT_FLOPPY_CMD);
+	return ide_issue_pc(drive);
 }
 
 void ide_floppy_create_read_capacity_cmd(struct ide_atapi_pc *pc)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index ac9e29a4991..5d2aa22cd6e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -694,7 +694,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive,
 
 	pc->retries++;
 
-	return ide_issue_pc(drive, WAIT_TAPE_CMD);
+	return ide_issue_pc(drive);
 }
 
 /* A mode sense command is used to "sense" tape parameters. */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index ad57a449294..db5ef8ae1ab 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1250,7 +1250,7 @@ int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
 
-ide_startstop_t ide_issue_pc(ide_drive_t *, unsigned int);
+ide_startstop_t ide_issue_pc(ide_drive_t *);
 
 ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *);
 
-- 
cgit v1.2.3-70-g09d2


From f153b82121b0366fe0e5f9553545cce237335175 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 2 Jan 2009 09:23:03 -0800
Subject: Sanitize gcc version header includes

 - include the gcc version-dependent header files from the generic gcc
   header file, rather than the other way around (iow: don't make the
   non-gcc header file have to know about gcc versions)

 - don't include compiler-gcc4.h for gcc 5 (for whenever it gets
   released).  That's just confusing and made us do odd things in the
   gcc4 header file (testing that we really had version 4!)

 - generate the name from the __GNUC__ version directly, rather than
   having a mess of #if conditionals.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h  | 5 +++++
 include/linux/compiler-gcc3.h | 3 ---
 include/linux/compiler-gcc4.h | 5 +----
 include/linux/compiler.h      | 8 ++------
 4 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5c8351b859f..af40f8eb86f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -61,3 +61,8 @@
 #define  noinline			__attribute__((noinline))
 #define __attribute_const__		__attribute__((__const__))
 #define __maybe_unused			__attribute__((unused))
+
+#define __gcc_header(x) #x
+#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h)
+#define gcc_header(x) _gcc_header(x)
+#include gcc_header(__GNUC__)
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index e5eb795f78a..2befe6513ce 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -2,9 +2,6 @@
 #error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead."
 #endif
 
-/* These definitions are for GCC v3.x.  */
-#include <linux/compiler-gcc.h>
-
 #if __GNUC_MINOR__ >= 3
 # define __used			__attribute__((__used__))
 #else
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 974f5b7bb20..aa426214331 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -2,9 +2,6 @@
 #error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead."
 #endif
 
-/* These definitions are for GCC v4.x.  */
-#include <linux/compiler-gcc.h>
-
 #define __used			__attribute__((__used__))
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
@@ -16,7 +13,7 @@
  */
 #define uninitialized_var(x) x = x
 
-#if !(__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+#if __GNUC_MINOR__ >= 3
 /* Mark functions as cold. gcc will assume any path leading to a call
    to them will be unlikely.  This means a lot of manual unlikely()s
    are unnecessary now for any paths leading to the usual suspects
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ea7c6be354b..d95da1020f1 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -36,12 +36,8 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 
 #ifdef __KERNEL__
 
-#if __GNUC__ >= 4
-# include <linux/compiler-gcc4.h>
-#elif __GNUC__ == 3 && __GNUC_MINOR__ >= 2
-# include <linux/compiler-gcc3.h>
-#else
-# error Sorry, your compiler is too old/not recognized.
+#ifdef __GNUC__
+#include <linux/compiler-gcc.h>
 #endif
 
 #define notrace __attribute__((no_instrument_function))
-- 
cgit v1.2.3-70-g09d2


From f9d14250071eda9972e4c9cea745a11185952114 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 2 Jan 2009 09:29:43 -0800
Subject: Disallow gcc versions 4.1.{0,1}

These compiler versions are known to miscompile __weak functions and
thus generate kernels that don't necessarily work correctly.  If a weak
function is int he same compilation unit as a caller, gcc may end up
inlining it, and thus binding the weak function too early.

See

    http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27781

for details.

Cc: Adrian Bunk <bunk@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc4.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index aa426214331..09992718f9e 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -2,6 +2,11 @@
 #error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead."
 #endif
 
+/* GCC 4.1.[01] miscompiles __weak */
+#if __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ <= 1
+# error Your version of gcc miscompiles the __weak directive
+#endif
+
 #define __used			__attribute__((__used__))
 #define __must_check 		__attribute__((warn_unused_result))
 #define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
-- 
cgit v1.2.3-70-g09d2


From a88a69c91256418c5907c2f1f8a0ec0a36f9e6cc Mon Sep 17 00:00:00 2001
From: Joe Peterson <joe@skyrush.com>
Date: Fri, 2 Jan 2009 13:40:53 +0000
Subject: n_tty: Fix loss of echoed characters and remove bkl from n_tty

Fixes the loss of echoed (and other ldisc-generated characters) when
the tty is stopped or when the driver output buffer is full (happens
frequently for input during continuous program output, such as ^C)
and removes the Big Kernel Lock from the N_TTY line discipline.

Adds an "echo buffer" to the N_TTY line discipline that handles all
ldisc-generated output (including echoed characters).  Along with the
loss of characters, this also fixes the associated loss of sync between
tty output and the ldisc state when characters cannot be immediately
written to the tty driver.

The echo buffer stores (in addition to characters) state operations that need
to be done at the time of character output (like management of the column
position).  This allows echo to cooperate correctly with program output,
since the ldisc state remains consistent with actual characters written.

Since the echo buffer code now isolates the tty column state code
to the process_out* and process_echoes functions, we can remove the
Big Kernel Lock (BKL) and replace it with mutex locks.

Highlights are:

* Handles echo (and other ldisc output) when tty driver buffer is full
  - continuous program output can block echo
* Saves echo when tty is in stopped state (e.g. ^S)
  - (e.g.: ^Q will correctly cause held characters to be released for output)
* Control character pairs (e.g. "^C") are treated atomically and not
  split up by interleaved program output
* Line discipline state is kept consistent with characters sent to
  the tty driver
* Remove the big kernel lock (BKL) from N_TTY line discipline

Signed-off-by: Joe Peterson <joe@skyrush.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/n_tty.c  | 736 +++++++++++++++++++++++++++++++++++++++-----------
 drivers/char/tty_io.c |   6 +-
 drivers/char/vt.c     |   2 +-
 include/linux/tty.h   |   6 +
 4 files changed, 594 insertions(+), 156 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index efbfe961265..a9bc5764fe7 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -62,6 +62,17 @@
 #define TTY_THRESHOLD_THROTTLE		128 /* now based on remaining room */
 #define TTY_THRESHOLD_UNTHROTTLE 	128
 
+/*
+ * Special byte codes used in the echo buffer to represent operations
+ * or special handling of characters.  Bytes in the echo buffer that
+ * are not part of such special blocks are treated as normal character
+ * codes.
+ */
+#define ECHO_OP_START 0xff
+#define ECHO_OP_MOVE_BACK_COL 0x80
+#define ECHO_OP_SET_CANON_COL 0x81
+#define ECHO_OP_ERASE_TAB 0x82
+
 static inline unsigned char *alloc_buf(void)
 {
 	gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
@@ -169,6 +180,7 @@ static void check_unthrottle(struct tty_struct *tty)
  *
  *	Locking: tty_read_lock for read fields.
  */
+
 static void reset_buffer_flags(struct tty_struct *tty)
 {
 	unsigned long flags;
@@ -176,6 +188,11 @@ static void reset_buffer_flags(struct tty_struct *tty)
 	spin_lock_irqsave(&tty->read_lock, flags);
 	tty->read_head = tty->read_tail = tty->read_cnt = 0;
 	spin_unlock_irqrestore(&tty->read_lock, flags);
+
+	mutex_lock(&tty->echo_lock);
+	tty->echo_pos = tty->echo_cnt = tty->echo_overrun = 0;
+	mutex_unlock(&tty->echo_lock);
+
 	tty->canon_head = tty->canon_data = tty->erasing = 0;
 	memset(&tty->read_flags, 0, sizeof tty->read_flags);
 	n_tty_set_room(tty);
@@ -266,89 +283,116 @@ static inline int is_continuation(unsigned char c, struct tty_struct *tty)
 }
 
 /**
- *	opost			-	output post processor
+ *	do_output_char			-	output one character
  *	@c: character (or partial unicode symbol)
  *	@tty: terminal device
+ *	@space: space available in tty driver write buffer
  *
- *	Perform OPOST processing.  Returns -1 when the output device is
- *	full and the character must be retried. Note that Linux currently
- *	ignores TABDLY, CRDLY, VTDLY, FFDLY and NLDLY. They simply aren't
- *	relevant in the world today. If you ever need them, add them here.
+ *	This is a helper function that handles one output character
+ *	(including special characters like TAB, CR, LF, etc.),
+ *	putting the results in the tty driver's write buffer.
+ *
+ *	Note that Linux currently ignores TABDLY, CRDLY, VTDLY, FFDLY
+ *	and NLDLY.  They simply aren't relevant in the world today.
+ *	If you ever need them, add them here.
  *
- *	Called from both the receive and transmit sides and can be called
- *	re-entrantly. Relies on lock_kernel() for tty->column state.
+ *	Returns the number of bytes of buffer space used or -1 if
+ *	no space left.
+ *
+ *	Locking: should be called under the output_lock to protect
+ *		 the column state and space left in the buffer
  */
 
-static int opost(unsigned char c, struct tty_struct *tty)
+static int do_output_char(unsigned char c, struct tty_struct *tty, int space)
 {
-	int	space, spaces;
+	int	spaces;
 
-	space = tty_write_room(tty);
 	if (!space)
 		return -1;
-
-	lock_kernel();
-	if (O_OPOST(tty)) {
-		switch (c) {
-		case '\n':
-			if (O_ONLRET(tty))
-				tty->column = 0;
-			if (O_ONLCR(tty)) {
-				if (space < 2) {
-					unlock_kernel();
-					return -1;
-				}
-				tty_put_char(tty, '\r');
-				tty->column = 0;
-			}
-			tty->canon_column = tty->column;
-			break;
-		case '\r':
-			if (O_ONOCR(tty) && tty->column == 0) {
-				unlock_kernel();
-				return 0;
-			}
-			if (O_OCRNL(tty)) {
-				c = '\n';
-				if (O_ONLRET(tty))
-					tty->canon_column = tty->column = 0;
-				break;
-			}
+	
+	switch (c) {
+	case '\n':
+		if (O_ONLRET(tty))
+			tty->column = 0;
+		if (O_ONLCR(tty)) {
+			if (space < 2)
+				return -1;
 			tty->canon_column = tty->column = 0;
+			tty_put_char(tty, '\r');
+			tty_put_char(tty, c);
+			return 2;
+		}
+		tty->canon_column = tty->column;
+		break;
+	case '\r':
+		if (O_ONOCR(tty) && tty->column == 0)
+			return 0;
+		if (O_OCRNL(tty)) {
+			c = '\n';
+			if (O_ONLRET(tty))
+				tty->canon_column = tty->column = 0;
 			break;
-		case '\t':
-			spaces = 8 - (tty->column & 7);
-			if (O_TABDLY(tty) == XTABS) {
-				if (space < spaces) {
-					unlock_kernel();
-					return -1;
-				}
-				tty->column += spaces;
-				tty->ops->write(tty, "        ", spaces);
-				unlock_kernel();
-				return 0;
-			}
+		}
+		tty->canon_column = tty->column = 0;
+		break;
+	case '\t':
+		spaces = 8 - (tty->column & 7);
+		if (O_TABDLY(tty) == XTABS) {
+			if (space < spaces)
+				return -1;
 			tty->column += spaces;
-			break;
-		case '\b':
-			if (tty->column > 0)
-				tty->column--;
-			break;
-		default:
-			if (O_OLCUC(tty))
-				c = toupper(c);
-			if (!iscntrl(c) && !is_continuation(c, tty))
-				tty->column++;
-			break;
+			tty->ops->write(tty, "        ", spaces);
+			return spaces;
 		}
+		tty->column += spaces;
+		break;
+	case '\b':
+		if (tty->column > 0)
+			tty->column--;
+		break;
+	default:
+		if (O_OLCUC(tty))
+			c = toupper(c);
+		if (!iscntrl(c) && !is_continuation(c, tty))
+			tty->column++;
+		break;
 	}
+
 	tty_put_char(tty, c);
-	unlock_kernel();
-	return 0;
+	return 1;
+}
+
+/**
+ *	process_output			-	output post processor
+ *	@c: character (or partial unicode symbol)
+ *	@tty: terminal device
+ *
+ *	Perform OPOST processing.  Returns -1 when the output device is
+ *	full and the character must be retried.
+ *
+ *	Locking: output_lock to protect column state and space left
+ *		 (also, this is called from n_tty_write under the
+ *		  tty layer write lock)
+ */
+
+static int process_output(unsigned char c, struct tty_struct *tty)
+{
+	int	space, retval;
+
+	mutex_lock(&tty->output_lock);
+
+	space = tty_write_room(tty);
+	retval = do_output_char(c, tty, space);
+
+	mutex_unlock(&tty->output_lock);
+	if (retval < 0)
+		return -1;
+	else
+		return 0;
 }
 
 /**
- *	opost_block		-	block postprocess
+ *	process_output_block		-	block post processor
  *	@tty: terminal device
  *	@inbuf: user buffer
  *	@nr: number of bytes
@@ -358,24 +402,29 @@ static int opost(unsigned char c, struct tty_struct *tty)
  *	the simple cases normally found and helps to generate blocks of
  *	symbols for the console driver and thus improve performance.
  *
- *	Called from n_tty_write under the tty layer write lock. Relies
- *	on lock_kernel for the tty->column state.
+ *	Locking: output_lock to protect column state and space left
+ *		 (also, this is called from n_tty_write under the
+ *		  tty layer write lock)
  */
 
-static ssize_t opost_block(struct tty_struct *tty,
-		       const unsigned char *buf, unsigned int nr)
+static ssize_t process_output_block(struct tty_struct *tty,
+				    const unsigned char *buf, unsigned int nr)
 {
 	int	space;
 	int 	i;
 	const unsigned char *cp;
 
+	mutex_lock(&tty->output_lock);
+
 	space = tty_write_room(tty);
 	if (!space)
+	{
+		mutex_unlock(&tty->output_lock);
 		return 0;
+	}
 	if (nr > space)
 		nr = space;
 
-	lock_kernel();
 	for (i = 0, cp = buf; i < nr; i++, cp++) {
 		switch (*cp) {
 		case '\n':
@@ -407,46 +456,393 @@ static ssize_t opost_block(struct tty_struct *tty,
 		}
 	}
 break_out:
-	if (tty->ops->flush_chars)
-		tty->ops->flush_chars(tty);
 	i = tty->ops->write(tty, buf, i);
-	unlock_kernel();
+
+	mutex_unlock(&tty->output_lock);
 	return i;
 }
 
+/**
+ *	process_echoes	-	write pending echo characters
+ *	@tty: terminal device
+ *
+ *	Write previously buffered echo (and other ldisc-generated)
+ *	characters to the tty.
+ *
+ *	Characters generated by the ldisc (including echoes) need to
+ *	be buffered because the driver's write buffer can fill during
+ *	heavy program output.  Echoing straight to the driver will
+ *	often fail under these conditions, causing lost characters and
+ *	resulting mismatches of ldisc state information.
+ *
+ *	Since the ldisc state must represent the characters actually sent
+ *	to the driver at the time of the write, operations like certain
+ *	changes in column state are also saved in the buffer and executed
+ *	here.
+ *
+ *	A circular fifo buffer is used so that the most recent characters
+ *	are prioritized.  Also, when control characters are echoed with a
+ *	prefixed "^", the pair is treated atomically and thus not separated.
+ *
+ *	Locking: output_lock to protect column state and space left,
+ *		 echo_lock to protect the echo buffer
+ */
+
+static void process_echoes(struct tty_struct *tty)
+{
+	int	space, nr;
+	unsigned char c;
+	unsigned char *cp, *buf_end;
+
+	if (!tty->echo_cnt)
+		return;
+
+	mutex_lock(&tty->output_lock);
+	mutex_lock(&tty->echo_lock);
+
+	space = tty_write_room(tty);
+
+	buf_end = tty->echo_buf + N_TTY_BUF_SIZE;
+	cp = tty->echo_buf + tty->echo_pos;
+	nr = tty->echo_cnt;
+	while (nr > 0) {
+		c = *cp;
+		if (c == ECHO_OP_START) {
+			unsigned char op;
+			unsigned char *opp;
+			int no_space_left = 0;
+
+			/*
+			 * If the buffer byte is the start of a multi-byte
+			 * operation, get the next byte, which is either the
+			 * op code or a control character value.
+			 */
+			opp = cp + 1;
+			if (opp == buf_end)
+				opp -= N_TTY_BUF_SIZE;
+			op = *opp;
+			
+			switch (op) {
+				unsigned int num_chars, num_bs;
+
+			case ECHO_OP_ERASE_TAB:
+				if (++opp == buf_end)
+					opp -= N_TTY_BUF_SIZE;
+				num_chars = *opp;
+
+				/*
+				 * Determine how many columns to go back
+				 * in order to erase the tab.
+				 * This depends on the number of columns
+				 * used by other characters within the tab
+				 * area.  If this (modulo 8) count is from
+				 * the start of input rather than from a
+				 * previous tab, we offset by canon column.
+				 * Otherwise, tab spacing is normal.
+				 */
+				if (!(num_chars & 0x80))
+					num_chars += tty->canon_column;
+				num_bs = 8 - (num_chars & 7);
+
+				if (num_bs > space) {
+					no_space_left = 1;
+					break;
+				}
+				space -= num_bs;
+				while (num_bs--) {
+					tty_put_char(tty, '\b');
+					if (tty->column > 0)
+						tty->column--;
+				}
+				cp += 3;
+				nr -= 3;
+				break;
+
+			case ECHO_OP_SET_CANON_COL:
+				tty->canon_column = tty->column;
+				cp += 2;
+				nr -= 2;
+				break;
+
+			case ECHO_OP_MOVE_BACK_COL:
+				if (tty->column > 0)
+					tty->column--;
+				cp += 2;
+				nr -= 2;
+				break;
+
+			case ECHO_OP_START:
+				/* This is an escaped echo op start code */
+				if (!space) {
+					no_space_left = 1;
+					break;
+				}
+				tty_put_char(tty, ECHO_OP_START);
+				tty->column++;
+				space--;
+				cp += 2;
+				nr -= 2;
+				break;
+
+			default:
+				if (iscntrl(op)) {
+					if (L_ECHOCTL(tty)) {
+						/*
+						 * Ensure there is enough space
+						 * for the whole ctrl pair.
+						 */
+						if (space < 2) {
+							no_space_left = 1;
+							break;
+						}
+						tty_put_char(tty, '^');
+						tty_put_char(tty, op ^ 0100);
+						tty->column += 2;
+						space -= 2;
+					} else {
+						if (!space) {
+							no_space_left = 1;
+							break;
+						}
+						tty_put_char(tty, op);
+						space--;
+					}
+				}
+				/*
+				 * If above falls through, this was an
+				 * undefined op.
+				 */
+				cp += 2;
+				nr -= 2;
+			}
+
+			if (no_space_left)
+				break;
+		} else {
+			int retval;
+
+			if ((retval = do_output_char(c, tty, space)) < 0)
+				break;
+			space -= retval;
+			cp += 1;
+			nr -= 1;
+		}
+
+		/* When end of circular buffer reached, wrap around */
+		if (cp >= buf_end)
+			cp -= N_TTY_BUF_SIZE;
+	}
+
+	if (nr == 0) {
+		tty->echo_pos = 0;
+		tty->echo_cnt = 0;
+		tty->echo_overrun = 0;
+	} else {
+		int num_processed = tty->echo_cnt - nr;
+		tty->echo_pos += num_processed;
+		tty->echo_pos &= N_TTY_BUF_SIZE - 1;
+		tty->echo_cnt = nr;
+		if (num_processed > 0)
+			tty->echo_overrun = 0;
+	}
+
+	mutex_unlock(&tty->echo_lock);
+	mutex_unlock(&tty->output_lock);
+
+	if (tty->ops->flush_chars)
+		tty->ops->flush_chars(tty);
+}
+
+/**
+ *	add_echo_byte	-	add a byte to the echo buffer
+ *	@c: unicode byte to echo
+ *	@tty: terminal device
+ *
+ *	Add a character or operation byte to the echo buffer.
+ *
+ *	Should be called under the echo lock to protect the echo buffer.
+ */
+
+static void add_echo_byte(unsigned char c, struct tty_struct *tty)
+{
+	int	new_byte_pos;
+
+	if (tty->echo_cnt == N_TTY_BUF_SIZE) {
+		/* Circular buffer is already at capacity */
+		new_byte_pos = tty->echo_pos;
+
+		/*
+		 * Since the buffer start position needs to be advanced,
+		 * be sure to step by a whole operation byte group.
+		 */
+		if (tty->echo_buf[tty->echo_pos] == ECHO_OP_START)
+		{
+			if (tty->echo_buf[(tty->echo_pos + 1) &
+					  (N_TTY_BUF_SIZE - 1)] ==
+						ECHO_OP_ERASE_TAB) {
+				tty->echo_pos += 3;
+				tty->echo_cnt -= 2;
+			} else {
+				tty->echo_pos += 2;
+				tty->echo_cnt -= 1;
+			}
+		} else {
+			tty->echo_pos++;
+		}
+		tty->echo_pos &= N_TTY_BUF_SIZE - 1;
+
+		tty->echo_overrun = 1;
+	} else {
+		new_byte_pos = tty->echo_pos + tty->echo_cnt;
+		new_byte_pos &= N_TTY_BUF_SIZE - 1;
+		tty->echo_cnt++;
+	}
+
+	tty->echo_buf[new_byte_pos] = c;
+}
+
+/**
+ *	echo_move_back_col	-	add operation to move back a column
+ *	@tty: terminal device
+ *
+ *	Add an operation to the echo buffer to move back one column.
+ *
+ *	Locking: echo_lock to protect the echo buffer
+ */
+
+static void echo_move_back_col(struct tty_struct *tty)
+{
+	mutex_lock(&tty->echo_lock);
+
+	add_echo_byte(ECHO_OP_START, tty);
+	add_echo_byte(ECHO_OP_MOVE_BACK_COL, tty);
+
+	mutex_unlock(&tty->echo_lock);
+}
+
+/**
+ *	echo_set_canon_col	-	add operation to set the canon column
+ *	@tty: terminal device
+ *
+ *	Add an operation to the echo buffer to set the canon column
+ *	to the current column.
+ *
+ *	Locking: echo_lock to protect the echo buffer
+ */
+
+static void echo_set_canon_col(struct tty_struct *tty)
+{
+	mutex_lock(&tty->echo_lock);
+
+	add_echo_byte(ECHO_OP_START, tty);
+	add_echo_byte(ECHO_OP_SET_CANON_COL, tty);
+
+	mutex_unlock(&tty->echo_lock);
+}
+
+/**
+ *	echo_erase_tab	-	add operation to erase a tab
+ *	@num_chars: number of character columns already used
+ *	@after_tab: true if num_chars starts after a previous tab
+ *	@tty: terminal device
+ *
+ *	Add an operation to the echo buffer to erase a tab.
+ *
+ *	Called by the eraser function, which knows how many character
+ *	columns have been used since either a previous tab or the start
+ *	of input.  This information will be used later, along with
+ *	canon column (if applicable), to go back the correct number
+ *	of columns.
+ *
+ *	Locking: echo_lock to protect the echo buffer
+ */
+
+static void echo_erase_tab(unsigned int num_chars, int after_tab,
+			   struct tty_struct *tty)
+{
+	mutex_lock(&tty->echo_lock);
+
+	add_echo_byte(ECHO_OP_START, tty);
+	add_echo_byte(ECHO_OP_ERASE_TAB, tty);
+
+	/* We only need to know this modulo 8 (tab spacing) */
+	num_chars &= 7;
+
+	/* Set the high bit as a flag if num_chars is after a previous tab */
+	if (after_tab)
+		num_chars |= 0x80;
+	
+	add_echo_byte(num_chars, tty);
+
+	mutex_unlock(&tty->echo_lock);
+}
+
+/**
+ *	echo_char_raw	-	echo a character raw
+ *	@c: unicode byte to echo
+ *	@tty: terminal device
+ *
+ *	Echo user input back onto the screen. This must be called only when
+ *	L_ECHO(tty) is true. Called from the driver receive_buf path.
+ *
+ *	This variant does not treat control characters specially.
+ *
+ *	Locking: echo_lock to protect the echo buffer
+ */
+
+static void echo_char_raw(unsigned char c, struct tty_struct *tty)
+{
+	mutex_lock(&tty->echo_lock);
+
+	if (c == ECHO_OP_START) {
+		add_echo_byte(ECHO_OP_START, tty);
+		add_echo_byte(ECHO_OP_START, tty);
+	} else {
+		add_echo_byte(c, tty);
+	}
+
+	mutex_unlock(&tty->echo_lock);
+}
 
 /**
- *	echo_char	-	echo characters
+ *	echo_char	-	echo a character
  *	@c: unicode byte to echo
  *	@tty: terminal device
  *
  *	Echo user input back onto the screen. This must be called only when
  *	L_ECHO(tty) is true. Called from the driver receive_buf path.
  *
- *	Relies on BKL for tty column locking
+ *	This variant tags control characters to be possibly echoed as
+ *	as "^X" (where X is the letter representing the control char).
+ *
+ *	Locking: echo_lock to protect the echo buffer
  */
 
 static void echo_char(unsigned char c, struct tty_struct *tty)
 {
-	if (L_ECHOCTL(tty) && iscntrl(c) && c != '\t') {
-		tty_put_char(tty, '^');
-		tty_put_char(tty, c ^ 0100);
-		tty->column += 2;
-	} else
-		opost(c, tty);
+	mutex_lock(&tty->echo_lock);
+
+	if (c == ECHO_OP_START) {
+		add_echo_byte(ECHO_OP_START, tty);
+		add_echo_byte(ECHO_OP_START, tty);
+	} else {
+		if (iscntrl(c) && c != '\t')
+			add_echo_byte(ECHO_OP_START, tty);
+		add_echo_byte(c, tty);
+	}
+
+	mutex_unlock(&tty->echo_lock);
 }
 
 /**
- *	finsh_erasing		-	complete erase
+ *	finish_erasing		-	complete erase
  *	@tty: tty doing the erase
- *
- *	Relies on BKL for tty column locking
  */
+
 static inline void finish_erasing(struct tty_struct *tty)
 {
 	if (tty->erasing) {
-		tty_put_char(tty, '/');
-		tty->column++;
+		echo_char_raw('/', tty);
 		tty->erasing = 0;
 	}
 }
@@ -460,7 +856,7 @@ static inline void finish_erasing(struct tty_struct *tty)
  *	present in the stream from the driver layer. Handles the complexities
  *	of UTF-8 multibyte symbols.
  *
- *	Locking: read_lock for tty buffers, BKL for column/erasing state
+ *	Locking: read_lock for tty buffers
  */
 
 static void eraser(unsigned char c, struct tty_struct *tty)
@@ -471,7 +867,7 @@ static void eraser(unsigned char c, struct tty_struct *tty)
 
 	/* FIXME: locking needed ? */
 	if (tty->read_head == tty->canon_head) {
-		/* opost('\a', tty); */		/* what do you think? */
+		/* echo_char_raw('\a', tty); */ /* what do you think? */
 		return;
 	}
 	if (c == ERASE_CHAR(tty))
@@ -497,7 +893,7 @@ static void eraser(unsigned char c, struct tty_struct *tty)
 			echo_char(KILL_CHAR(tty), tty);
 			/* Add a newline if ECHOK is on and ECHOKE is off. */
 			if (L_ECHOK(tty))
-				opost('\n', tty);
+				echo_char_raw('\n', tty);
 			return;
 		}
 		kill_type = KILL;
@@ -533,67 +929,62 @@ static void eraser(unsigned char c, struct tty_struct *tty)
 		if (L_ECHO(tty)) {
 			if (L_ECHOPRT(tty)) {
 				if (!tty->erasing) {
-					tty_put_char(tty, '\\');
-					tty->column++;
+					echo_char_raw('\\', tty);
 					tty->erasing = 1;
 				}
 				/* if cnt > 1, output a multi-byte character */
 				echo_char(c, tty);
 				while (--cnt > 0) {
 					head = (head+1) & (N_TTY_BUF_SIZE-1);
-					tty_put_char(tty, tty->read_buf[head]);
+					echo_char_raw(tty->read_buf[head], tty);
+					echo_move_back_col(tty);
 				}
 			} else if (kill_type == ERASE && !L_ECHOE(tty)) {
 				echo_char(ERASE_CHAR(tty), tty);
 			} else if (c == '\t') {
-				unsigned int col = tty->canon_column;
-				unsigned long tail = tty->canon_head;
-
-				/* Find the column of the last char. */
-				while (tail != tty->read_head) {
+				unsigned int num_chars = 0;
+				int after_tab = 0;
+				unsigned long tail = tty->read_head;
+
+				/*
+				 * Count the columns used for characters
+				 * since the start of input or after a
+				 * previous tab.
+				 * This info is used to go back the correct
+				 * number of columns.
+				 */
+				while (tail != tty->canon_head) {
+					tail = (tail-1) & (N_TTY_BUF_SIZE-1);
 					c = tty->read_buf[tail];
-					if (c == '\t')
-						col = (col | 7) + 1;
+					if (c == '\t') {
+						after_tab = 1;
+						break;
+					}
 					else if (iscntrl(c)) {
 						if (L_ECHOCTL(tty))
-							col += 2;
-					} else if (!is_continuation(c, tty))
-						col++;
-					tail = (tail+1) & (N_TTY_BUF_SIZE-1);
-				}
-
-				/* should never happen */
-				if (tty->column > 0x80000000)
-					tty->column = 0;
-
-				/* Now backup to that column. */
-				while (tty->column > col) {
-					/* Can't use opost here. */
-					tty_put_char(tty, '\b');
-					if (tty->column > 0)
-						tty->column--;
+							num_chars += 2;
+					} else if (!is_continuation(c, tty)) {
+						num_chars++;
+					}
 				}
+				echo_erase_tab(num_chars, after_tab, tty);
 			} else {
 				if (iscntrl(c) && L_ECHOCTL(tty)) {
-					tty_put_char(tty, '\b');
-					tty_put_char(tty, ' ');
-					tty_put_char(tty, '\b');
-					if (tty->column > 0)
-						tty->column--;
+					echo_char_raw('\b', tty);
+					echo_char_raw(' ', tty);
+					echo_char_raw('\b', tty);
 				}
 				if (!iscntrl(c) || L_ECHOCTL(tty)) {
-					tty_put_char(tty, '\b');
-					tty_put_char(tty, ' ');
-					tty_put_char(tty, '\b');
-					if (tty->column > 0)
-						tty->column--;
+					echo_char_raw('\b', tty);
+					echo_char_raw(' ', tty);
+					echo_char_raw('\b', tty);
 				}
 			}
 		}
 		if (kill_type == ERASE)
 			break;
 	}
-	if (tty->read_head == tty->canon_head)
+	if (tty->read_head == tty->canon_head && L_ECHO(tty))
 		finish_erasing(tty);
 }
 
@@ -724,14 +1115,18 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 		c=tolower(c);
 
 	if (tty->stopped && !tty->flow_stopped && I_IXON(tty) &&
-	    ((I_IXANY(tty) && c != START_CHAR(tty) && c != STOP_CHAR(tty)) ||
-	     c == INTR_CHAR(tty) || c == QUIT_CHAR(tty) || c == SUSP_CHAR(tty)))
+	    I_IXANY(tty) && c != START_CHAR(tty) && c != STOP_CHAR(tty) &&
+	    c != INTR_CHAR(tty) && c != QUIT_CHAR(tty) && c != SUSP_CHAR(tty)) {
 		start_tty(tty);
+		process_echoes(tty);
+	}
 
 	if (tty->closing) {
 		if (I_IXON(tty)) {
-			if (c == START_CHAR(tty))
+			if (c == START_CHAR(tty)) {
 				start_tty(tty);
+				process_echoes(tty);
+			}
 			else if (c == STOP_CHAR(tty))
 				stop_tty(tty);
 		}
@@ -745,17 +1140,20 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 	 * up.
 	 */
 	if (!test_bit(c, tty->process_char_map) || tty->lnext) {
-		finish_erasing(tty);
 		tty->lnext = 0;
 		if (L_ECHO(tty)) {
+			finish_erasing(tty);
 			if (tty->read_cnt >= N_TTY_BUF_SIZE-1) {
-				tty_put_char(tty, '\a'); /* beep if no space */
+				/* beep if no space */
+				echo_char_raw('\a', tty);
+				process_echoes(tty);
 				return;
 			}
 			/* Record the column of first canon char. */
 			if (tty->canon_head == tty->read_head)
-				tty->canon_column = tty->column;
+				echo_set_canon_col(tty);
 			echo_char(c, tty);
+			process_echoes(tty);
 		}
 		if (I_PARMRK(tty) && c == (unsigned char) '\377')
 			put_tty_queue(c, tty);
@@ -766,6 +1164,7 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 	if (I_IXON(tty)) {
 		if (c == START_CHAR(tty)) {
 			start_tty(tty);
+			process_echoes(tty);
 			return;
 		}
 		if (c == STOP_CHAR(tty)) {
@@ -786,7 +1185,6 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 		if (c == SUSP_CHAR(tty)) {
 send_signal:
 			/*
-			 * Echo character, and then send the signal.
 			 * Note that we do not use isig() here because we want
 			 * the order to be:
 			 * 1) flush, 2) echo, 3) signal
@@ -795,8 +1193,12 @@ send_signal:
 				n_tty_flush_buffer(tty);
 				tty_driver_flush_buffer(tty);
 			}
-			if (L_ECHO(tty))
+			if (I_IXON(tty))
+				start_tty(tty);
+			if (L_ECHO(tty)) {
 				echo_char(c, tty);
+				process_echoes(tty);
+			}
 			if (tty->pgrp)
 				kill_pgrp(tty->pgrp, signal, 1);
 			return;
@@ -815,6 +1217,7 @@ send_signal:
 		if (c == ERASE_CHAR(tty) || c == KILL_CHAR(tty) ||
 		    (c == WERASE_CHAR(tty) && L_IEXTEN(tty))) {
 			eraser(c, tty);
+			process_echoes(tty);
 			return;
 		}
 		if (c == LNEXT_CHAR(tty) && L_IEXTEN(tty)) {
@@ -822,8 +1225,9 @@ send_signal:
 			if (L_ECHO(tty)) {
 				finish_erasing(tty);
 				if (L_ECHOCTL(tty)) {
-					tty_put_char(tty, '^');
-					tty_put_char(tty, '\b');
+					echo_char_raw('^', tty);
+					echo_char_raw('\b', tty);
+					process_echoes(tty);
 				}
 			}
 			return;
@@ -834,18 +1238,20 @@ send_signal:
 
 			finish_erasing(tty);
 			echo_char(c, tty);
-			opost('\n', tty);
+			echo_char_raw('\n', tty);
 			while (tail != tty->read_head) {
 				echo_char(tty->read_buf[tail], tty);
 				tail = (tail+1) & (N_TTY_BUF_SIZE-1);
 			}
+			process_echoes(tty);
 			return;
 		}
 		if (c == '\n') {
 			if (L_ECHO(tty) || L_ECHONL(tty)) {
 				if (tty->read_cnt >= N_TTY_BUF_SIZE-1)
-					tty_put_char(tty, '\a');
-				opost('\n', tty);
+					echo_char_raw('\a', tty);
+				echo_char_raw('\n', tty);
+				process_echoes(tty);
 			}
 			goto handle_newline;
 		}
@@ -862,11 +1268,12 @@ send_signal:
 			 */
 			if (L_ECHO(tty)) {
 				if (tty->read_cnt >= N_TTY_BUF_SIZE-1)
-					tty_put_char(tty, '\a');
+					echo_char_raw('\a', tty);
 				/* Record the column of first canon char. */
 				if (tty->canon_head == tty->read_head)
-					tty->canon_column = tty->column;
+					echo_set_canon_col(tty);
 				echo_char(c, tty);
+				process_echoes(tty);
 			}
 			/*
 			 * XXX does PARMRK doubling happen for
@@ -889,20 +1296,23 @@ handle_newline:
 		}
 	}
 
-	finish_erasing(tty);
 	if (L_ECHO(tty)) {
+		finish_erasing(tty);
 		if (tty->read_cnt >= N_TTY_BUF_SIZE-1) {
-			tty_put_char(tty, '\a'); /* beep if no space */
+			/* beep if no space */
+			echo_char_raw('\a', tty);
+			process_echoes(tty);
 			return;
 		}
 		if (c == '\n')
-			opost('\n', tty);
+			echo_char_raw('\n', tty);
 		else {
 			/* Record the column of first canon char. */
 			if (tty->canon_head == tty->read_head)
-				tty->canon_column = tty->column;
+				echo_set_canon_col(tty);
 			echo_char(c, tty);
 		}
+		process_echoes(tty);
 	}
 
 	if (I_PARMRK(tty) && c == (unsigned char) '\377')
@@ -923,6 +1333,9 @@ handle_newline:
 
 static void n_tty_write_wakeup(struct tty_struct *tty)
 {
+	/* Write out any echoed characters that are still pending */
+	process_echoes(tty);
+	
 	if (tty->fasync) {
 		set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
 		kill_fasync(&tty->fasync, SIGIO, POLL_OUT);
@@ -1134,6 +1547,10 @@ static void n_tty_close(struct tty_struct *tty)
 		free_buf(tty->read_buf);
 		tty->read_buf = NULL;
 	}
+	if (tty->echo_buf) {
+		free_buf(tty->echo_buf);
+		tty->echo_buf = NULL;
+	}
 }
 
 /**
@@ -1151,13 +1568,19 @@ static int n_tty_open(struct tty_struct *tty)
 	if (!tty)
 		return -EINVAL;
 
-	/* This one is ugly. Currently a malloc failure here can panic */
+	/* These are ugly. Currently a malloc failure here can panic */
 	if (!tty->read_buf) {
 		tty->read_buf = alloc_buf();
 		if (!tty->read_buf)
 			return -ENOMEM;
 	}
+	if (!tty->echo_buf) {
+		tty->echo_buf = alloc_buf();
+		if (!tty->echo_buf)
+			return -ENOMEM;
+	}
 	memset(tty->read_buf, 0, N_TTY_BUF_SIZE);
+	memset(tty->echo_buf, 0, N_TTY_BUF_SIZE);
 	reset_buffer_flags(tty);
 	tty->column = 0;
 	n_tty_set_termios(tty, NULL);
@@ -1487,16 +1910,23 @@ do_it_again:
  *	@buf: userspace buffer pointer
  *	@nr: size of I/O
  *
- *	Write function of the terminal device. This is serialized with
+ *	Write function of the terminal device.  This is serialized with
  *	respect to other write callers but not to termios changes, reads
- *	and other such events. We must be careful with N_TTY as the receive
- *	code will echo characters, thus calling driver write methods.
+ *	and other such events.  Since the receive code will echo characters,
+ *	thus calling driver write methods, the output_lock is used in
+ *	the output processing functions called here as well as in the
+ *	echo processing function to protect the column state and space
+ *	left in the buffer.
  *
  *	This code must be sure never to sleep through a hangup.
+ *
+ *	Locking: output_lock to protect column state and space left
+ *		 (note that the process_output*() functions take this
+ *		  lock themselves)
  */
 
 static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
-			  const unsigned char *buf, size_t nr)
+			   const unsigned char *buf, size_t nr)
 {
 	const unsigned char *b = buf;
 	DECLARE_WAITQUEUE(wait, current);
@@ -1510,6 +1940,9 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
 			return retval;
 	}
 
+	/* Write out any echoed characters that are still pending */
+	process_echoes(tty);
+	
 	add_wait_queue(&tty->write_wait, &wait);
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -1523,7 +1956,7 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
 		}
 		if (O_OPOST(tty) && !(test_bit(TTY_HW_COOK_OUT, &tty->flags))) {
 			while (nr > 0) {
-				ssize_t num = opost_block(tty, b, nr);
+				ssize_t num = process_output_block(tty, b, nr);
 				if (num < 0) {
 					if (num == -EAGAIN)
 						break;
@@ -1535,7 +1968,7 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
 				if (nr == 0)
 					break;
 				c = *b;
-				if (opost(c, tty) < 0)
+				if (process_output(c, tty) < 0)
 					break;
 				b++; nr--;
 			}
@@ -1663,4 +2096,3 @@ struct tty_ldisc_ops tty_ldisc_N_TTY = {
 	.receive_buf     = n_tty_receive_buf,
 	.write_wakeup    = n_tty_write_wakeup
 };
-
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index db15f9ba7c0..d8d240c8a25 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1111,9 +1111,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
  *		Locks the line discipline as required
  *		Writes to the tty driver are serialized by the atomic_write_lock
  *	and are then processed in chunks to the device. The line discipline
- *	write method will not be involked in parallel for each device
- *		The line discipline write method is called under the big
- *	kernel lock for historical reasons. New code should not rely on this.
+ *	write method will not be invoked in parallel for each device.
  */
 
 static ssize_t tty_write(struct file *file, const char __user *buf,
@@ -2785,6 +2783,8 @@ void initialize_tty_struct(struct tty_struct *tty,
 	INIT_WORK(&tty->hangup_work, do_tty_hangup);
 	mutex_init(&tty->atomic_read_lock);
 	mutex_init(&tty->atomic_write_lock);
+	mutex_init(&tty->output_lock);
+	mutex_init(&tty->echo_lock);
 	spin_lock_init(&tty->read_lock);
 	spin_lock_init(&tty->ctrl_lock);
 	INIT_LIST_HEAD(&tty->tty_files);
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 008176edbd6..639e126b2bf 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2679,7 +2679,7 @@ static int con_write_room(struct tty_struct *tty)
 {
 	if (tty->stopped)
 		return 0;
-	return 4096;		/* No limit, really; we're not buffering */
+	return 32768;		/* No limit, really; we're not buffering */
 }
 
 static int con_chars_in_buffer(struct tty_struct *tty)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 3f4954c55e5..dfc77ded198 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -253,6 +253,7 @@ struct tty_struct {
 	unsigned int column;
 	unsigned char lnext:1, erasing:1, raw:1, real_raw:1, icanon:1;
 	unsigned char closing:1;
+	unsigned char echo_overrun:1;
 	unsigned short minimum_to_wake;
 	unsigned long overrun_time;
 	int num_overrun;
@@ -262,11 +263,16 @@ struct tty_struct {
 	int read_tail;
 	int read_cnt;
 	unsigned long read_flags[N_TTY_BUF_SIZE/(8*sizeof(unsigned long))];
+	unsigned char *echo_buf;
+	unsigned int echo_pos;
+	unsigned int echo_cnt;
 	int canon_data;
 	unsigned long canon_head;
 	unsigned int canon_column;
 	struct mutex atomic_read_lock;
 	struct mutex atomic_write_lock;
+	struct mutex output_lock;
+	struct mutex echo_lock;
 	unsigned char *write_buf;
 	int write_cnt;
 	spinlock_t read_lock;
-- 
cgit v1.2.3-70-g09d2


From fc6f6238226e6d1248e1967eae2bf556eaf3ac17 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:43:17 +0000
Subject: pty: simplify resize

We have special case logic for resizing pty/tty pairs. We also have a per
driver resize method so for the pty case we should use it.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/hvc_console.c |  2 +-
 drivers/char/pty.c         | 54 +++++++++++++++++++++++++++++++++++++++++++++-
 drivers/char/tty_io.c      | 31 ++++++++++----------------
 drivers/char/vt.c          | 14 ++++++------
 include/linux/tty.h        |  3 +--
 include/linux/tty_driver.h |  6 ++----
 6 files changed, 74 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index 0587b66d6fc..5a8a4c28c86 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -529,7 +529,7 @@ static void hvc_set_winsz(struct work_struct *work)
 	tty = tty_kref_get(hp->tty);
 	spin_unlock_irqrestore(&hp->lock, hvc_flags);
 
-	tty_do_resize(tty, tty, &ws);
+	tty_do_resize(tty, &ws);
 	tty_kref_put(tty);
 }
 
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 6d4582712b1..b5daaaa9007 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -230,6 +230,55 @@ static void pty_set_termios(struct tty_struct *tty,
 	tty->termios->c_cflag |= (CS8 | CREAD);
 }
 
+/**
+ *	pty_do_resize		-	resize event
+ *	@tty: tty being resized
+ *	@real_tty: real tty (not the same as tty if using a pty/tty pair)
+ *	@rows: rows (character)
+ *	@cols: cols (character)
+ *
+ *	Update the termios variables and send the neccessary signals to
+ *	peform a terminal resize correctly
+ */
+
+int pty_resize(struct tty_struct *tty,  struct winsize *ws)
+{
+	struct pid *pgrp, *rpgrp;
+	unsigned long flags;
+	struct tty_struct *pty = tty->link;
+
+	/* For a PTY we need to lock the tty side */
+	mutex_lock(&tty->termios_mutex);
+	if (!memcmp(ws, &tty->winsize, sizeof(*ws)))
+		goto done;
+
+	/* Get the PID values and reference them so we can
+	   avoid holding the tty ctrl lock while sending signals.
+	   We need to lock these individually however. */
+
+	spin_lock_irqsave(&tty->ctrl_lock, flags);
+	pgrp = get_pid(tty->pgrp);
+	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+
+	spin_lock_irqsave(&pty->ctrl_lock, flags);
+	rpgrp = get_pid(pty->pgrp);
+	spin_unlock_irqrestore(&pty->ctrl_lock, flags);
+
+	if (pgrp)
+		kill_pgrp(pgrp, SIGWINCH, 1);
+	if (rpgrp != pgrp && rpgrp)
+		kill_pgrp(rpgrp, SIGWINCH, 1);
+
+	put_pid(pgrp);
+	put_pid(rpgrp);
+
+	tty->winsize = *ws;
+	pty->winsize = *ws;	/* Never used so will go away soon */
+done:
+	mutex_unlock(&tty->termios_mutex);
+	return 0;
+}
+
 static int pty_install(struct tty_driver *driver, struct tty_struct *tty)
 {
 	struct tty_struct *o_tty;
@@ -290,6 +339,7 @@ static const struct tty_operations pty_ops = {
 	.chars_in_buffer = pty_chars_in_buffer,
 	.unthrottle = pty_unthrottle,
 	.set_termios = pty_set_termios,
+	.resize = pty_resize
 };
 
 /* Traditional BSD devices */
@@ -319,6 +369,7 @@ static const struct tty_operations pty_ops_bsd = {
 	.unthrottle = pty_unthrottle,
 	.set_termios = pty_set_termios,
 	.ioctl = pty_bsd_ioctl,
+	.resize = pty_resize
 };
 
 static void __init legacy_pty_init(void)
@@ -561,7 +612,8 @@ static const struct tty_operations ptm_unix98_ops = {
 	.unthrottle = pty_unthrottle,
 	.set_termios = pty_set_termios,
 	.ioctl = pty_unix98_ioctl,
-	.shutdown = pty_unix98_shutdown
+	.shutdown = pty_unix98_shutdown,
+	.resize = pty_resize
 };
 
 static const struct tty_operations pty_unix98_ops = {
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 2a15af65dd1..d33e5ab0617 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2048,7 +2048,6 @@ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg)
 /**
  *	tty_do_resize		-	resize event
  *	@tty: tty being resized
- *	@real_tty: real tty (not the same as tty if using a pty/tty pair)
  *	@rows: rows (character)
  *	@cols: cols (character)
  *
@@ -2056,41 +2055,34 @@ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg)
  *	peform a terminal resize correctly
  */
 
-int tty_do_resize(struct tty_struct *tty, struct tty_struct *real_tty,
-					struct winsize *ws)
+int tty_do_resize(struct tty_struct *tty, struct winsize *ws)
 {
-	struct pid *pgrp, *rpgrp;
+	struct pid *pgrp;
 	unsigned long flags;
 
-	/* For a PTY we need to lock the tty side */
-	mutex_lock(&real_tty->termios_mutex);
-	if (!memcmp(ws, &real_tty->winsize, sizeof(*ws)))
+	/* Lock the tty */
+	mutex_lock(&tty->termios_mutex);
+	if (!memcmp(ws, &tty->winsize, sizeof(*ws)))
 		goto done;
 	/* Get the PID values and reference them so we can
 	   avoid holding the tty ctrl lock while sending signals */
 	spin_lock_irqsave(&tty->ctrl_lock, flags);
 	pgrp = get_pid(tty->pgrp);
-	rpgrp = get_pid(real_tty->pgrp);
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 
 	if (pgrp)
 		kill_pgrp(pgrp, SIGWINCH, 1);
-	if (rpgrp != pgrp && rpgrp)
-		kill_pgrp(rpgrp, SIGWINCH, 1);
-
 	put_pid(pgrp);
-	put_pid(rpgrp);
 
 	tty->winsize = *ws;
-	real_tty->winsize = *ws;
 done:
-	mutex_unlock(&real_tty->termios_mutex);
+	mutex_unlock(&tty->termios_mutex);
 	return 0;
 }
 
 /**
  *	tiocswinsz		-	implement window size set ioctl
- *	@tty; tty
+ *	@tty; tty side of tty
  *	@arg: user buffer for result
  *
  *	Copies the user idea of the window size to the kernel. Traditionally
@@ -2103,17 +2095,16 @@ done:
  *	then calls into the default method.
  */
 
-static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty,
-	struct winsize __user *arg)
+static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg)
 {
 	struct winsize tmp_ws;
 	if (copy_from_user(&tmp_ws, arg, sizeof(*arg)))
 		return -EFAULT;
 
 	if (tty->ops->resize)
-		return tty->ops->resize(tty, real_tty, &tmp_ws);
+		return tty->ops->resize(tty, &tmp_ws);
 	else
-		return tty_do_resize(tty, real_tty, &tmp_ws);
+		return tty_do_resize(tty, &tmp_ws);
 }
 
 /**
@@ -2538,7 +2529,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCGWINSZ:
 		return tiocgwinsz(real_tty, p);
 	case TIOCSWINSZ:
-		return tiocswinsz(tty, real_tty, p);
+		return tiocswinsz(real_tty, p);
 	case TIOCCONS:
 		return real_tty != tty ? -EINVAL : tioccons(file);
 	case FIONBIO:
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 639e126b2bf..80014213fb5 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -819,8 +819,8 @@ static inline int resize_screen(struct vc_data *vc, int width, int height,
  *	ctrl_lock of the tty IFF a tty is passed.
  */
 
-static int vc_do_resize(struct tty_struct *tty, struct tty_struct *real_tty,
-		struct vc_data *vc, unsigned int cols, unsigned int lines)
+static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc,
+				unsigned int cols, unsigned int lines)
 {
 	unsigned long old_origin, new_origin, new_scr_end, rlth, rrem, err = 0;
 	unsigned int old_cols, old_rows, old_row_size, old_screen_size;
@@ -932,7 +932,7 @@ static int vc_do_resize(struct tty_struct *tty, struct tty_struct *real_tty,
 		ws.ws_row = vc->vc_rows;
 		ws.ws_col = vc->vc_cols;
 		ws.ws_ypixel = vc->vc_scan_lines;
-		tty_do_resize(tty, real_tty, &ws);
+		tty_do_resize(tty, &ws);
 	}
 
 	if (CON_IS_VISIBLE(vc))
@@ -954,13 +954,12 @@ static int vc_do_resize(struct tty_struct *tty, struct tty_struct *real_tty,
 
 int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows)
 {
-	return vc_do_resize(vc->vc_tty, vc->vc_tty, vc, cols, rows);
+	return vc_do_resize(vc->vc_tty, vc, cols, rows);
 }
 
 /**
  *	vt_resize		-	resize a VT
  *	@tty: tty to resize
- *	@real_tty: tty if a pty/tty pair
  *	@ws: winsize attributes
  *
  *	Resize a virtual terminal. This is called by the tty layer as we
@@ -971,14 +970,13 @@ int vc_resize(struct vc_data *vc, unsigned int cols, unsigned int rows)
  *	termios_mutex and the tty ctrl_lock in that order.
  */
 
-int vt_resize(struct tty_struct *tty, struct tty_struct *real_tty,
-	struct winsize *ws)
+int vt_resize(struct tty_struct *tty, struct winsize *ws)
 {
 	struct vc_data *vc = tty->driver_data;
 	int ret;
 
 	acquire_console_sem();
-	ret = vc_do_resize(tty, real_tty, vc, ws->ws_col, ws->ws_row);
+	ret = vc_do_resize(tty, vc, ws->ws_col, ws->ws_row);
 	release_console_sem();
 	return ret;
 }
diff --git a/include/linux/tty.h b/include/linux/tty.h
index dfc77ded198..f88169787a5 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -360,8 +360,7 @@ extern int tty_write_room(struct tty_struct *tty);
 extern void tty_driver_flush_buffer(struct tty_struct *tty);
 extern void tty_throttle(struct tty_struct *tty);
 extern void tty_unthrottle(struct tty_struct *tty);
-extern int tty_do_resize(struct tty_struct *tty, struct tty_struct *real_tty,
-						struct winsize *ws);
+extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws);
 extern void tty_shutdown(struct tty_struct *tty);
 extern void tty_free_termios(struct tty_struct *tty);
 extern int is_current_pgrp_orphaned(void);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 78416b90158..08e088334db 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -196,8 +196,7 @@
  *	Optional: If not provided then the write method is called under
  *	the atomic write lock to keep it serialized with the ldisc.
  *
- * int (*resize)(struct tty_struct *tty, struct tty_struct *real_tty,
- *				unsigned int rows, unsigned int cols);
+ * int (*resize)(struct tty_struct *tty, struct winsize *ws)
  *
  *	Called when a termios request is issued which changes the
  *	requested terminal geometry.
@@ -258,8 +257,7 @@ struct tty_operations {
 	int (*tiocmget)(struct tty_struct *tty, struct file *file);
 	int (*tiocmset)(struct tty_struct *tty, struct file *file,
 			unsigned int set, unsigned int clear);
-	int (*resize)(struct tty_struct *tty, struct tty_struct *real_tty,
-				struct winsize *ws);
+	int (*resize)(struct tty_struct *tty, struct winsize *ws);
 	int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew);
 #ifdef CONFIG_CONSOLE_POLL
 	int (*poll_init)(struct tty_driver *driver, int line, char *options);
-- 
cgit v1.2.3-70-g09d2


From 975a1a7d887048d4afc9201383e11b7af991866b Mon Sep 17 00:00:00 2001
From: Russell King <rmk+lkml@arm.linux.org.uk>
Date: Fri, 2 Jan 2009 13:44:27 +0000
Subject: And here's a patch (to be applied on top of the last) which prevents

this happening again by making use of 'const'.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250_pci.c | 37 +++++++++++++++++++++----------------
 include/linux/8250_pci.h  |  2 +-
 2 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 057b532ccaa..0b794138f68 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -42,7 +42,8 @@ struct pci_serial_quirk {
 	u32	subvendor;
 	u32	subdevice;
 	int	(*init)(struct pci_dev *dev);
-	int	(*setup)(struct serial_private *, struct pciserial_board *,
+	int	(*setup)(struct serial_private *,
+			 const struct pciserial_board *,
 			 struct uart_port *, int);
 	void	(*exit)(struct pci_dev *dev);
 };
@@ -107,7 +108,7 @@ setup_port(struct serial_private *priv, struct uart_port *port,
  * ADDI-DATA GmbH communication cards <info@addi-data.com>
  */
 static int addidata_apci7800_setup(struct serial_private *priv,
-				struct pciserial_board *board,
+				const struct pciserial_board *board,
 				struct uart_port *port, int idx)
 {
 	unsigned int bar = 0, offset = board->first_offset;
@@ -134,7 +135,7 @@ static int addidata_apci7800_setup(struct serial_private *priv,
  * Not that ugly ;) -- HW
  */
 static int
-afavlab_setup(struct serial_private *priv, struct pciserial_board *board,
+afavlab_setup(struct serial_private *priv, const struct pciserial_board *board,
 	      struct uart_port *port, int idx)
 {
 	unsigned int bar, offset = board->first_offset;
@@ -188,8 +189,9 @@ static int pci_hp_diva_init(struct pci_dev *dev)
  * some serial ports are supposed to be hidden on certain models.
  */
 static int
-pci_hp_diva_setup(struct serial_private *priv, struct pciserial_board *board,
-	      struct uart_port *port, int idx)
+pci_hp_diva_setup(struct serial_private *priv,
+		const struct pciserial_board *board,
+		struct uart_port *port, int idx)
 {
 	unsigned int offset = board->first_offset;
 	unsigned int bar = FL_GET_BASE(board->flags);
@@ -306,7 +308,7 @@ static void __devexit pci_plx9050_exit(struct pci_dev *dev)
 
 /* SBS Technologies Inc. PMC-OCTPRO and P-OCTAL cards */
 static int
-sbs_setup(struct serial_private *priv, struct pciserial_board *board,
+sbs_setup(struct serial_private *priv, const struct pciserial_board *board,
 		struct uart_port *port, int idx)
 {
 	unsigned int bar, offset = board->first_offset;
@@ -463,7 +465,7 @@ static int pci_siig_init(struct pci_dev *dev)
 }
 
 static int pci_siig_setup(struct serial_private *priv,
-			  struct pciserial_board *board,
+			  const struct pciserial_board *board,
 			  struct uart_port *port, int idx)
 {
 	unsigned int bar = FL_GET_BASE(board->flags) + idx, offset = 0;
@@ -534,7 +536,8 @@ static int pci_timedia_init(struct pci_dev *dev)
  * Ugh, this is ugly as all hell --- TYT
  */
 static int
-pci_timedia_setup(struct serial_private *priv, struct pciserial_board *board,
+pci_timedia_setup(struct serial_private *priv,
+		  const struct pciserial_board *board,
 		  struct uart_port *port, int idx)
 {
 	unsigned int bar = 0, offset = board->first_offset;
@@ -568,7 +571,7 @@ pci_timedia_setup(struct serial_private *priv, struct pciserial_board *board,
  */
 static int
 titan_400l_800l_setup(struct serial_private *priv,
-		      struct pciserial_board *board,
+		      const struct pciserial_board *board,
 		      struct uart_port *port, int idx)
 {
 	unsigned int bar, offset = board->first_offset;
@@ -770,7 +773,8 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev)
 }
 
 static int
-pci_default_setup(struct serial_private *priv, struct pciserial_board *board,
+pci_default_setup(struct serial_private *priv,
+		  const struct pciserial_board *board,
 		  struct uart_port *port, int idx)
 {
 	unsigned int bar, offset = board->first_offset, maxnr;
@@ -1099,7 +1103,7 @@ static struct pci_serial_quirk *find_quirk(struct pci_dev *dev)
 }
 
 static inline int get_pci_irq(struct pci_dev *dev,
-				struct pciserial_board *board)
+				const struct pciserial_board *board)
 {
 	if (board->flags & FL_NOIRQ)
 		return 0;
@@ -1894,8 +1898,8 @@ serial_pci_guess_board(struct pci_dev *dev, struct pciserial_board *board)
 }
 
 static inline int
-serial_pci_matches(struct pciserial_board *board,
-		   struct pciserial_board *guessed)
+serial_pci_matches(const struct pciserial_board *board,
+		   const struct pciserial_board *guessed)
 {
 	return
 	    board->num_ports == guessed->num_ports &&
@@ -1906,7 +1910,7 @@ serial_pci_matches(struct pciserial_board *board,
 }
 
 struct serial_private *
-pciserial_init_ports(struct pci_dev *dev, struct pciserial_board *board)
+pciserial_init_ports(struct pci_dev *dev, const struct pciserial_board *board)
 {
 	struct uart_port serial_port;
 	struct serial_private *priv;
@@ -2039,7 +2043,8 @@ static int __devinit
 pciserial_init_one(struct pci_dev *dev, const struct pci_device_id *ent)
 {
 	struct serial_private *priv;
-	struct pciserial_board *board, tmp;
+	const struct pciserial_board *board;
+	struct pciserial_board tmp;
 	int rc;
 
 	if (ent->driver_data >= ARRAY_SIZE(pci_boards)) {
@@ -2066,7 +2071,7 @@ pciserial_init_one(struct pci_dev *dev, const struct pci_device_id *ent)
 		 * We matched one of our class entries.  Try to
 		 * determine the parameters of this board.
 		 */
-		rc = serial_pci_guess_board(dev, board);
+		rc = serial_pci_guess_board(dev, &tmp);
 		if (rc)
 			goto disable;
 	} else {
diff --git a/include/linux/8250_pci.h b/include/linux/8250_pci.h
index 3209dd46ea7..b24ff086a66 100644
--- a/include/linux/8250_pci.h
+++ b/include/linux/8250_pci.h
@@ -31,7 +31,7 @@ struct pciserial_board {
 struct serial_private;
 
 struct serial_private *
-pciserial_init_ports(struct pci_dev *dev, struct pciserial_board *board);
+pciserial_init_ports(struct pci_dev *dev, const struct pciserial_board *board);
 void pciserial_remove_ports(struct serial_private *priv);
 void pciserial_suspend_ports(struct serial_private *priv);
 void pciserial_resume_ports(struct serial_private *priv);
-- 
cgit v1.2.3-70-g09d2


From c9b3976e3fec266be25c5001a70aa0a890b6c476 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:44:56 +0000
Subject: tty: Fix PPP hang under load

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_ldisc.c | 30 +++++++++++++++++++++---------
 include/linux/tty.h      |  1 +
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index f307f135cbf..7a84b406a95 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -316,8 +316,7 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
 {
 	/* wait_event is a macro */
 	wait_event(tty_ldisc_wait, tty_ldisc_try(tty));
-	if (tty->ldisc.refcount == 0)
-		printk(KERN_ERR "tty_ldisc_ref_wait\n");
+	WARN_ON(tty->ldisc.refcount == 0);
 	return &tty->ldisc;
 }
 
@@ -376,15 +375,17 @@ EXPORT_SYMBOL_GPL(tty_ldisc_deref);
  *	@tty: terminal to activate ldisc on
  *
  *	Set the TTY_LDISC flag when the line discipline can be called
- *	again. Do necessary wakeups for existing sleepers.
+ *	again. Do necessary wakeups for existing sleepers. Clear the LDISC
+ *	changing flag to indicate any ldisc change is now over.
  *
- *	Note: nobody should set this bit except via this function. Clearing
- *	directly is allowed.
+ *	Note: nobody should set the TTY_LDISC bit except via this function.
+ *	Clearing directly is allowed.
  */
 
 void tty_ldisc_enable(struct tty_struct *tty)
 {
 	set_bit(TTY_LDISC, &tty->flags);
+	clear_bit(TTY_LDISC_CHANGING, &tty->flags);
 	wake_up(&tty_ldisc_wait);
 }
 
@@ -496,7 +497,14 @@ restart:
 	 *	reference to the line discipline. The TTY_LDISC bit
 	 *	prevents anyone taking a reference once it is clear.
 	 *	We need the lock to avoid racing reference takers.
+	 *
+	 *	We must clear the TTY_LDISC bit here to avoid a livelock
+	 *	with a userspace app continually trying to use the tty in
+	 *	parallel to the change and re-referencing the tty.
 	 */
+	clear_bit(TTY_LDISC, &tty->flags);
+	if (o_tty)
+		clear_bit(TTY_LDISC, &o_tty->flags);
 
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
 	if (tty->ldisc.refcount || (o_tty && o_tty->ldisc.refcount)) {
@@ -528,7 +536,7 @@ restart:
 	 *	If the TTY_LDISC bit is set, then we are racing against
 	 *	another ldisc change
 	 */
-	if (!test_bit(TTY_LDISC, &tty->flags)) {
+	if (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
 		struct tty_ldisc *ld;
 		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 		tty_ldisc_put(new_ldisc.ops);
@@ -536,10 +544,14 @@ restart:
 		tty_ldisc_deref(ld);
 		goto restart;
 	}
-
-	clear_bit(TTY_LDISC, &tty->flags);
+	/*
+	 *	This flag is used to avoid two parallel ldisc changes. Once
+	 *	open and close are fine grained locked this may work better
+	 *	as a mutex shared with the open/close/hup paths
+	 */
+	set_bit(TTY_LDISC_CHANGING, &tty->flags);
 	if (o_tty)
-		clear_bit(TTY_LDISC, &o_tty->flags);
+		set_bit(TTY_LDISC_CHANGING, &o_tty->flags);
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 	
 	/*
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f88169787a5..bbbeaef9962 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -301,6 +301,7 @@ struct tty_struct {
 #define TTY_PUSH 		6	/* n_tty private */
 #define TTY_CLOSING 		7	/* ->close() in progress */
 #define TTY_LDISC 		9	/* Line discipline attached */
+#define TTY_LDISC_CHANGING 	10	/* Line discipline changing */
 #define TTY_HW_COOK_OUT 	14	/* Hardware can do output cooking */
 #define TTY_HW_COOK_IN 		15	/* Hardware can do input cooking */
 #define TTY_PTY_LOCK 		16	/* pty private */
-- 
cgit v1.2.3-70-g09d2


From 31f35939d1d9bcfb3099b32c67b896d2792603f9 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:45:05 +0000
Subject: tty_port: Add a port level carrier detect operation

This is the first step to generalising the various pieces of waiting logic
duplicated in all sorts of serial drivers.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/esp.c             | 61 +++++++++++++++++++++++----------------
 drivers/char/generic_serial.c  | 43 ++++++++++++++--------------
 drivers/char/isicom.c          | 51 +++++++++++++++++++++------------
 drivers/char/istallion.c       | 28 ++++++++++++------
 drivers/char/moxa.c            | 26 +++++++++++++----
 drivers/char/mxser.c           | 54 +++++++++++++++++++++--------------
 drivers/char/rio/rio_linux.c   | 18 ++++++------
 drivers/char/riscom8.c         | 65 ++++++++++++++++++++++++++----------------
 drivers/char/rocket.c          | 40 +++++++++++++++++---------
 drivers/char/ser_a2232.c       | 19 ++++++------
 drivers/char/stallion.c        | 28 +++++++++++++-----
 drivers/char/sx.c              | 27 +++++++++---------
 drivers/char/synclink.c        | 61 ++++++++++++++++++++++++++-------------
 drivers/char/synclink_gt.c     | 48 ++++++++++++++++++++-----------
 drivers/char/synclinkmp.c      | 55 ++++++++++++++++++++++-------------
 drivers/char/tty_port.c        | 17 +++++++++++
 drivers/char/vme_scc.c         | 15 ++++++----
 include/linux/generic_serial.h |  1 -
 include/linux/tty.h            |  9 ++++++
 19 files changed, 427 insertions(+), 239 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/esp.c b/drivers/char/esp.c
index 7f077c0097f..45ec263ec01 100644
--- a/drivers/char/esp.c
+++ b/drivers/char/esp.c
@@ -2054,6 +2054,15 @@ static void esp_hangup(struct tty_struct *tty)
 	wake_up_interruptible(&info->port.open_wait);
 }
 
+static int esp_carrier_raised(struct tty_port *port)
+{
+	struct esp_struct *info = container_of(port, struct esp_struct, port);
+	serial_out(info, UART_ESI_CMD1, ESI_GET_UART_STAT);
+	if (serial_in(info, UART_ESI_STAT2) & UART_MSR_DCD)
+		return 1;
+	return 0;
+}
+
 /*
  * ------------------------------------------------------------
  * esp_open() and friends
@@ -2066,17 +2075,19 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	int		retval;
 	int		do_clocal = 0;
 	unsigned long	flags;
+	int		cd;
+	struct tty_port *port = &info->port;
 
 	/*
 	 * If the device is in the middle of being closed, then block
 	 * until it's done, and then try again.
 	 */
 	if (tty_hung_up_p(filp) ||
-	    (info->port.flags & ASYNC_CLOSING)) {
-		if (info->port.flags & ASYNC_CLOSING)
-			interruptible_sleep_on(&info->port.close_wait);
+	    (port->flags & ASYNC_CLOSING)) {
+		if (port->flags & ASYNC_CLOSING)
+			interruptible_sleep_on(&port->close_wait);
 #ifdef SERIAL_DO_RESTART
-		if (info->port.flags & ASYNC_HUP_NOTIFY)
+		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
 		else
 			return -ERESTARTSYS;
@@ -2091,7 +2102,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	 */
 	if ((filp->f_flags & O_NONBLOCK) ||
 	    (tty->flags & (1 << TTY_IO_ERROR))) {
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -2101,20 +2112,20 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	/*
 	 * Block waiting for the carrier detect and the line to become
 	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, info->port.count is dropped by one, so that
+	 * this loop, port->count is dropped by one, so that
 	 * rs_close() knows when to free things.  We restore it upon
 	 * exit, either normal or abnormal.
 	 */
 	retval = 0;
-	add_wait_queue(&info->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 #ifdef SERIAL_DEBUG_OPEN
 	printk(KERN_DEBUG "block_til_ready before block: ttys%d, count = %d\n",
-	       info->line, info->port.count);
+	       info->line, port->count);
 #endif
 	spin_lock_irqsave(&info->lock, flags);
 	if (!tty_hung_up_p(filp))
-		info->port.count--;
-	info->port.blocked_open++;
+		port->count--;
+	port->blocked_open++;
 	while (1) {
 		if ((tty->termios->c_cflag & CBAUD)) {
 			unsigned int scratch;
@@ -2129,9 +2140,9 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) ||
-		    !(info->port.flags & ASYNC_INITIALIZED)) {
+		    !(port->flags & ASYNC_INITIALIZED)) {
 #ifdef SERIAL_DO_RESTART
-			if (info->port.flags & ASYNC_HUP_NOTIFY)
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
 			else
 				retval = -ERESTARTSYS;
@@ -2141,11 +2152,9 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 			break;
 		}
 
-		serial_out(info, UART_ESI_CMD1, ESI_GET_UART_STAT);
-		if (serial_in(info, UART_ESI_STAT2) & UART_MSR_DCD)
-			do_clocal = 1;
+		cd = tty_port_carrier_raised(port);
 
-		if (!(info->port.flags & ASYNC_CLOSING) &&
+		if (!(port->flags & ASYNC_CLOSING) &&
 		    (do_clocal))
 			break;
 		if (signal_pending(current)) {
@@ -2154,25 +2163,25 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		}
 #ifdef SERIAL_DEBUG_OPEN
 		printk(KERN_DEBUG "block_til_ready blocking: ttys%d, count = %d\n",
-		       info->line, info->port.count);
+		       info->line, port->count);
 #endif
 		spin_unlock_irqrestore(&info->lock, flags);
 		schedule();
 		spin_lock_irqsave(&info->lock, flags);
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&info->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 	if (!tty_hung_up_p(filp))
-		info->port.count++;
-	info->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 	spin_unlock_irqrestore(&info->lock, flags);
 #ifdef SERIAL_DEBUG_OPEN
 	printk(KERN_DEBUG "block_til_ready after blocking: ttys%d, count = %d\n",
-	       info->line, info->port.count);
+	       info->line, port->count);
 #endif
 	if (retval)
 		return retval;
-	info->port.flags |= ASYNC_NORMAL_ACTIVE;
+	port->flags |= ASYNC_NORMAL_ACTIVE;
 	return 0;
 }
 
@@ -2329,6 +2338,10 @@ static const struct tty_operations esp_ops = {
 	.tiocmset = esp_tiocmset,
 };
 
+static const struct tty_port_operations esp_port_ops = {
+	.esp_carrier_raised,
+};
+
 /*
  * The serial driver boot-time initialization code!
  */
@@ -2415,6 +2428,8 @@ static int __init espserial_init(void)
 	offset = 0;
 
 	do {
+		tty_port_init(&info->port);
+		info->port.ops = &esp_port_ops;
 		info->io_port = esp[i] + offset;
 		info->irq = irq[i];
 		info->line = (i * 8) + (offset / 8);
@@ -2437,8 +2452,6 @@ static int __init espserial_init(void)
 		info->config.flow_off = flow_off;
 		info->config.pio_threshold = pio_threshold;
 		info->next_port = ports;
-		init_waitqueue_head(&info->port.open_wait);
-		init_waitqueue_head(&info->port.close_wait);
 		init_waitqueue_head(&info->delta_msr_wait);
 		init_waitqueue_head(&info->break_wait);
 		ports = info;
diff --git a/drivers/char/generic_serial.c b/drivers/char/generic_serial.c
index c6090f84a2e..2356994ee01 100644
--- a/drivers/char/generic_serial.c
+++ b/drivers/char/generic_serial.c
@@ -397,7 +397,8 @@ void gs_hangup(struct tty_struct *tty)
 
 int gs_block_til_ready(void *port_, struct file * filp)
 {
-	struct gs_port *port = port_;
+	struct gs_port *gp = port_;
+	struct tty_port *port = &gp->port;
 	DECLARE_WAITQUEUE(wait, current);
 	int    retval;
 	int    do_clocal = 0;
@@ -409,16 +410,16 @@ int gs_block_til_ready(void *port_, struct file * filp)
 
 	if (!port) return 0;
 
-	tty = port->port.tty;
+	tty = port->tty;
 
 	gs_dprintk (GS_DEBUG_BTR, "Entering gs_block_till_ready.\n"); 
 	/*
 	 * If the device is in the middle of being closed, then block
 	 * until it's done, and then try again.
 	 */
-	if (tty_hung_up_p(filp) || port->port.flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&port->port.close_wait);
-		if (port->port.flags & ASYNC_HUP_NOTIFY)
+	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
+		interruptible_sleep_on(&port->close_wait);
+		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
 		else
 			return -ERESTARTSYS;
@@ -432,7 +433,7 @@ int gs_block_til_ready(void *port_, struct file * filp)
 	 */
 	if ((filp->f_flags & O_NONBLOCK) ||
 	    (tty->flags & (1 << TTY_IO_ERROR))) {
-		port->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -444,34 +445,34 @@ int gs_block_til_ready(void *port_, struct file * filp)
 	/*
 	 * Block waiting for the carrier detect and the line to become
 	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, port->port.count is dropped by one, so that
+	 * this loop, port->count is dropped by one, so that
 	 * rs_close() knows when to free things.  We restore it upon
 	 * exit, either normal or abnormal.
 	 */
 	retval = 0;
 
-	add_wait_queue(&port->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 
 	gs_dprintk (GS_DEBUG_BTR, "after add waitq.\n"); 
-	spin_lock_irqsave(&port->driver_lock, flags);
+	spin_lock_irqsave(&gp->driver_lock, flags);
 	if (!tty_hung_up_p(filp)) {
-		port->port.count--;
+		port->count--;
 	}
-	spin_unlock_irqrestore(&port->driver_lock, flags);
-	port->port.blocked_open++;
+	spin_unlock_irqrestore(&gp->driver_lock, flags);
+	port->blocked_open++;
 	while (1) {
-		CD = port->rd->get_CD (port);
+		CD = tty_port_carrier_raised(port);
 		gs_dprintk (GS_DEBUG_BTR, "CD is now %d.\n", CD);
 		set_current_state (TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) ||
-		    !(port->port.flags & ASYNC_INITIALIZED)) {
-			if (port->port.flags & ASYNC_HUP_NOTIFY)
+		    !(port->flags & ASYNC_INITIALIZED)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
 			else
 				retval = -ERESTARTSYS;
 			break;
 		}
-		if (!(port->port.flags & ASYNC_CLOSING) &&
+		if (!(port->flags & ASYNC_CLOSING) &&
 		    (do_clocal || CD))
 			break;
 		gs_dprintk (GS_DEBUG_BTR, "signal_pending is now: %d (%lx)\n", 
@@ -483,17 +484,17 @@ int gs_block_til_ready(void *port_, struct file * filp)
 		schedule();
 	}
 	gs_dprintk (GS_DEBUG_BTR, "Got out of the loop. (%d)\n",
-		    port->port.blocked_open);
+		    port->blocked_open);
 	set_current_state (TASK_RUNNING);
-	remove_wait_queue(&port->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 	if (!tty_hung_up_p(filp)) {
-		port->port.count++;
+		port->count++;
 	}
-	port->port.blocked_open--;
+	port->blocked_open--;
 	if (retval)
 		return retval;
 
-	port->port.flags |= ASYNC_NORMAL_ACTIVE;
+	port->flags |= ASYNC_NORMAL_ACTIVE;
 	func_exit ();
 	return 0;
 }			 
diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index 04e4549299b..b3da4858fd4 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -830,20 +830,28 @@ static int isicom_setup_port(struct tty_struct *tty)
 	return 0;
 }
 
+static int isicom_carrier_raised(struct tty_port *port)
+{
+	struct isi_port *ip = container_of(port, struct isi_port, port);
+	return (ip->status & ISI_DCD)?1 : 0;
+}
+
 static int block_til_ready(struct tty_struct *tty, struct file *filp,
-	struct isi_port *port)
+	struct isi_port *ip)
 {
-	struct isi_board *card = port->card;
+	struct isi_board *card = ip->card;
+	struct tty_port *port = &ip->port;
 	int do_clocal = 0, retval;
 	unsigned long flags;
 	DECLARE_WAITQUEUE(wait, current);
+	int cd;
 
 	/* block if port is in the process of being closed */
 
-	if (tty_hung_up_p(filp) || port->port.flags & ASYNC_CLOSING) {
+	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
 		pr_dbg("block_til_ready: close in progress.\n");
-		interruptible_sleep_on(&port->port.close_wait);
-		if (port->port.flags & ASYNC_HUP_NOTIFY)
+		interruptible_sleep_on(&port->close_wait);
+		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
 		else
 			return -ERESTARTSYS;
@@ -854,7 +862,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	if ((filp->f_flags & O_NONBLOCK) ||
 			(tty->flags & (1 << TTY_IO_ERROR))) {
 		pr_dbg("block_til_ready: non-block mode.\n");
-		port->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -864,29 +872,29 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	/* block waiting for DCD to be asserted, and while
 						callout dev is busy */
 	retval = 0;
-	add_wait_queue(&port->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 
 	spin_lock_irqsave(&card->card_lock, flags);
 	if (!tty_hung_up_p(filp))
-		port->port.count--;
-	port->port.blocked_open++;
+		port->count--;
+	port->blocked_open++;
 	spin_unlock_irqrestore(&card->card_lock, flags);
 
 	while (1) {
-		raise_dtr_rts(port);
+		raise_dtr_rts(ip);
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) || !(port->port.flags & ASYNC_INITIALIZED)) {
-			if (port->port.flags & ASYNC_HUP_NOTIFY)
+		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
 			else
 				retval = -ERESTARTSYS;
 			break;
 		}
-		if (!(port->port.flags & ASYNC_CLOSING) &&
-				(do_clocal || (port->status & ISI_DCD))) {
+		cd = tty_port_carrier_raised(port);
+		if (!(port->flags & ASYNC_CLOSING) &&
+				(do_clocal || cd))
 			break;
-		}
 		if (signal_pending(current)) {
 			retval = -ERESTARTSYS;
 			break;
@@ -894,15 +902,15 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		schedule();
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 	spin_lock_irqsave(&card->card_lock, flags);
 	if (!tty_hung_up_p(filp))
-		port->port.count++;
-	port->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 	spin_unlock_irqrestore(&card->card_lock, flags);
 	if (retval)
 		return retval;
-	port->port.flags |= ASYNC_NORMAL_ACTIVE;
+	port->flags |= ASYNC_NORMAL_ACTIVE;
 	return 0;
 }
 
@@ -1452,6 +1460,10 @@ static const struct tty_operations isicom_ops = {
 	.break_ctl		= isicom_send_break,
 };
 
+static const struct tty_port_operations isicom_port_ops = {
+	.carrier_raised		= isicom_carrier_raised,
+};
+
 static int __devinit reset_card(struct pci_dev *pdev,
 	const unsigned int card, unsigned int *signature)
 {
@@ -1794,6 +1806,7 @@ static int __init isicom_init(void)
 		spin_lock_init(&isi_card[idx].card_lock);
 		for (channel = 0; channel < 16; channel++, port++) {
 			tty_port_init(&port->port);
+			port->port.ops = &isicom_port_ops;
 			port->magic = ISICOM_MAGIC;
 			port->card = &isi_card[idx];
 			port->channel = channel;
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index 4b10770fa93..c4682f9e34b 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -151,7 +151,7 @@ static char	*stli_drvversion = "5.6.0";
 static char	*stli_serialname = "ttyE";
 
 static struct tty_driver	*stli_serial;
-
+static const struct tty_port_operations stli_port_ops;
 
 #define	STLI_TXBUFSIZE		4096
 
@@ -1183,6 +1183,12 @@ static int stli_setport(struct tty_struct *tty)
 
 /*****************************************************************************/
 
+static int stli_carrier_raised(struct tty_port *port)
+{
+	struct stliport *portp = container_of(port, struct stliport, port);
+	return (portp->sigs & TIOCM_CD) ? 1 : 0;
+}
+
 /*
  *	Possibly need to wait for carrier (DCD signal) to come high. Say
  *	maybe because if we are clocal then we don't need to wait...
@@ -1193,6 +1199,7 @@ static int stli_waitcarrier(struct tty_struct *tty, struct stlibrd *brdp,
 {
 	unsigned long flags;
 	int rc, doclocal;
+	struct tty_port *port = &portp->port;
 
 	rc = 0;
 	doclocal = 0;
@@ -1203,7 +1210,7 @@ static int stli_waitcarrier(struct tty_struct *tty, struct stlibrd *brdp,
 	spin_lock_irqsave(&stli_lock, flags);
 	portp->openwaitcnt++;
 	if (! tty_hung_up_p(filp))
-		portp->port.count--;
+		port->count--;
 	spin_unlock_irqrestore(&stli_lock, flags);
 
 	for (;;) {
@@ -1212,27 +1219,27 @@ static int stli_waitcarrier(struct tty_struct *tty, struct stlibrd *brdp,
 		    &portp->asig, sizeof(asysigs_t), 0)) < 0)
 			break;
 		if (tty_hung_up_p(filp) ||
-		    ((portp->port.flags & ASYNC_INITIALIZED) == 0)) {
-			if (portp->port.flags & ASYNC_HUP_NOTIFY)
+		    ((port->flags & ASYNC_INITIALIZED) == 0)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				rc = -EBUSY;
 			else
 				rc = -ERESTARTSYS;
 			break;
 		}
-		if (((portp->port.flags & ASYNC_CLOSING) == 0) &&
-		    (doclocal || (portp->sigs & TIOCM_CD))) {
+		if (((port->flags & ASYNC_CLOSING) == 0) &&
+		    (doclocal || tty_port_carrier_raised(port))) {
 			break;
 		}
 		if (signal_pending(current)) {
 			rc = -ERESTARTSYS;
 			break;
 		}
-		interruptible_sleep_on(&portp->port.open_wait);
+		interruptible_sleep_on(&port->open_wait);
 	}
 
 	spin_lock_irqsave(&stli_lock, flags);
 	if (! tty_hung_up_p(filp))
-		portp->port.count++;
+		port->count++;
 	portp->openwaitcnt--;
 	spin_unlock_irqrestore(&stli_lock, flags);
 
@@ -2696,6 +2703,7 @@ static int stli_initports(struct stlibrd *brdp)
 			continue;
 		}
 		tty_port_init(&portp->port);
+		portp->port.ops = &stli_port_ops;
 		portp->magic = STLI_PORTMAGIC;
 		portp->portnr = i;
 		portp->brdnr = brdp->brdnr;
@@ -4518,6 +4526,10 @@ static const struct tty_operations stli_ops = {
 	.tiocmset = stli_tiocmset,
 };
 
+static const struct tty_port_operations stli_port_ops = {
+	.carrier_raised = stli_carrier_raised,
+};
+
 /*****************************************************************************/
 /*
  *	Loadable module initialization stuff.
diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index 12d327a2c9b..8b0da97d529 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -206,6 +206,7 @@ static void moxa_poll(unsigned long);
 static void moxa_set_tty_param(struct tty_struct *, struct ktermios *);
 static void moxa_setup_empty_event(struct tty_struct *);
 static void moxa_shut_down(struct tty_struct *);
+static int moxa_carrier_raised(struct tty_port *);
 /*
  * moxa board interface functions:
  */
@@ -405,6 +406,10 @@ static const struct tty_operations moxa_ops = {
 	.tiocmset = moxa_tiocmset,
 };
 
+static const struct tty_port_operations moxa_port_ops = {
+	.carrier_raised = moxa_carrier_raised,
+};
+
 static struct tty_driver *moxaDriver;
 static DEFINE_TIMER(moxaTimer, moxa_poll, 0, 0);
 static DEFINE_SPINLOCK(moxa_lock);
@@ -826,6 +831,7 @@ static int moxa_init_board(struct moxa_board_conf *brd, struct device *dev)
 
 	for (i = 0, p = brd->ports; i < MAX_PORTS_PER_BOARD; i++, p++) {
 		tty_port_init(&p->port);
+		p->port.ops = &moxa_port_ops;
 		p->type = PORT_16550A;
 		p->cflag = B9600 | CS8 | CREAD | CLOCAL | HUPCL;
 	}
@@ -1115,15 +1121,27 @@ static void moxa_close_port(struct tty_struct *tty)
 	tty_port_tty_set(&ch->port, NULL);
 }
 
+static int moxa_carrier_raised(struct tty_port *port)
+{
+	struct moxa_port *ch = container_of(port, struct moxa_port, port);
+	int dcd;
+
+	spin_lock_bh(&moxa_lock);
+	dcd = ch->DCDState;
+	spin_unlock_bh(&moxa_lock);
+	return dcd;
+}
+
 static int moxa_block_till_ready(struct tty_struct *tty, struct file *filp,
 			    struct moxa_port *ch)
 {
+	struct tty_port *port = &ch->port;
 	DEFINE_WAIT(wait);
 	int retval = 0;
 	u8 dcd;
 
 	while (1) {
-		prepare_to_wait(&ch->port.open_wait, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp)) {
 #ifdef SERIAL_DO_RESTART
 			retval = -ERESTARTSYS;
@@ -1132,9 +1150,7 @@ static int moxa_block_till_ready(struct tty_struct *tty, struct file *filp,
 #endif
 			break;
 		}
-		spin_lock_bh(&moxa_lock);
-		dcd = ch->DCDState;
-		spin_unlock_bh(&moxa_lock);
+		dcd = tty_port_carrier_raised(port);
 		if (dcd)
 			break;
 
@@ -1144,7 +1160,7 @@ static int moxa_block_till_ready(struct tty_struct *tty, struct file *filp,
 		}
 		schedule();
 	}
-	finish_wait(&ch->port.open_wait, &wait);
+	finish_wait(&port->open_wait, &wait);
 
 	return retval;
 }
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 04776691541..eafbbcf355e 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -541,13 +541,21 @@ static unsigned char mxser_get_msr(int baseaddr, int mode, int port)
 	return status;
 }
 
+static int mxser_carrier_raised(struct tty_port *port)
+{
+	struct mxser_port *mp = container_of(port, struct mxser_port, port);
+	return (inb(mp->ioaddr + UART_MSR) & UART_MSR_DCD)?1:0;
+}
+
 static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
-		struct mxser_port *port)
+		struct mxser_port *mp)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int retval;
 	int do_clocal = 0;
 	unsigned long flags;
+	int cd;
+	struct tty_port *port = &mp->port;
 
 	/*
 	 * If non-blocking mode is set, or the port is not enabled,
@@ -555,7 +563,7 @@ static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
 	 */
 	if ((filp->f_flags & O_NONBLOCK) ||
 			test_bit(TTY_IO_ERROR, &tty->flags)) {
-		port->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -565,34 +573,33 @@ static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
 	/*
 	 * Block waiting for the carrier detect and the line to become
 	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, port->port.count is dropped by one, so that
+	 * this loop, port->count is dropped by one, so that
 	 * mxser_close() knows when to free things.  We restore it upon
 	 * exit, either normal or abnormal.
 	 */
 	retval = 0;
-	add_wait_queue(&port->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 
-	spin_lock_irqsave(&port->slock, flags);
+	spin_lock_irqsave(&mp->slock, flags);
 	if (!tty_hung_up_p(filp))
-		port->port.count--;
-	spin_unlock_irqrestore(&port->slock, flags);
-	port->port.blocked_open++;
+		port->count--;
+	spin_unlock_irqrestore(&mp->slock, flags);
+	port->blocked_open++;
 	while (1) {
-		spin_lock_irqsave(&port->slock, flags);
-		outb(inb(port->ioaddr + UART_MCR) |
-			UART_MCR_DTR | UART_MCR_RTS, port->ioaddr + UART_MCR);
-		spin_unlock_irqrestore(&port->slock, flags);
+		spin_lock_irqsave(&mp->slock, flags);
+		outb(inb(mp->ioaddr + UART_MCR) |
+			UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+		spin_unlock_irqrestore(&mp->slock, flags);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) || !(port->port.flags & ASYNC_INITIALIZED)) {
-			if (port->port.flags & ASYNC_HUP_NOTIFY)
+		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
 			else
 				retval = -ERESTARTSYS;
 			break;
 		}
-		if (!(port->port.flags & ASYNC_CLOSING) &&
-				(do_clocal ||
-				(inb(port->ioaddr + UART_MSR) & UART_MSR_DCD)))
+		cd = tty_port_carrier_raised(port);
+		if (!(port->flags & ASYNC_CLOSING) && (do_clocal || cd))
 			break;
 		if (signal_pending(current)) {
 			retval = -ERESTARTSYS;
@@ -601,13 +608,13 @@ static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
 		schedule();
 	}
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 	if (!tty_hung_up_p(filp))
-		port->port.count++;
-	port->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 	if (retval)
 		return retval;
-	port->port.flags |= ASYNC_NORMAL_ACTIVE;
+	port->flags |= ASYNC_NORMAL_ACTIVE;
 	return 0;
 }
 
@@ -2449,6 +2456,10 @@ static const struct tty_operations mxser_ops = {
 	.tiocmset = mxser_tiocmset,
 };
 
+struct tty_port_operations mxser_port_ops = {
+	.carrier_raised = mxser_carrier_raised,
+};
+
 /*
  * The MOXA Smartio/Industio serial driver boot-time initialization code!
  */
@@ -2482,6 +2493,7 @@ static int __devinit mxser_initbrd(struct mxser_board *brd,
 	for (i = 0; i < brd->info->nports; i++) {
 		info = &brd->ports[i];
 		tty_port_init(&info->port);
+		info->port.ops = &mxser_port_ops;
 		info->board = brd;
 		info->stop_rx = 0;
 		info->ldisc_stop_rx = 0;
diff --git a/drivers/char/rio/rio_linux.c b/drivers/char/rio/rio_linux.c
index a8f68a3f14d..ec2afd13947 100644
--- a/drivers/char/rio/rio_linux.c
+++ b/drivers/char/rio/rio_linux.c
@@ -173,7 +173,7 @@ static void rio_disable_tx_interrupts(void *ptr);
 static void rio_enable_tx_interrupts(void *ptr);
 static void rio_disable_rx_interrupts(void *ptr);
 static void rio_enable_rx_interrupts(void *ptr);
-static int rio_get_CD(void *ptr);
+static int rio_carrier_raised(struct tty_port *port);
 static void rio_shutdown_port(void *ptr);
 static int rio_set_real_termios(void *ptr);
 static void rio_hungup(void *ptr);
@@ -224,7 +224,6 @@ static struct real_driver rio_real_driver = {
 	rio_enable_tx_interrupts,
 	rio_disable_rx_interrupts,
 	rio_enable_rx_interrupts,
-	rio_get_CD,
 	rio_shutdown_port,
 	rio_set_real_termios,
 	rio_chars_in_buffer,
@@ -476,9 +475,9 @@ static void rio_enable_rx_interrupts(void *ptr)
 
 
 /* Jeez. Isn't this simple?  */
-static int rio_get_CD(void *ptr)
+static int rio_carrier_raised(struct tty_port *port)
 {
-	struct Port *PortP = ptr;
+	struct Port *PortP = container_of(port, struct Port, gs.port);
 	int rv;
 
 	func_enter();
@@ -806,7 +805,9 @@ static void *ckmalloc(int size)
 	return p;
 }
 
-
+static const struct tty_port_operations rio_port_ops = {
+	.carrier_raised = rio_carrier_raised,
+};
 
 static int rio_init_datastructures(void)
 {
@@ -842,17 +843,14 @@ static int rio_init_datastructures(void)
 			goto free6;
 		}
 		rio_dprintk(RIO_DEBUG_INIT, "initing port %d (%d)\n", i, port->Mapped);
+		tty_port_init(&port->gs.port);
+		port->gs.port.ops = &rio_port_ops;
 		port->PortNum = i;
 		port->gs.magic = RIO_MAGIC;
 		port->gs.close_delay = HZ / 2;
 		port->gs.closing_wait = 30 * HZ;
 		port->gs.rd = &rio_real_driver;
 		spin_lock_init(&port->portSem);
-		/*
-		 * Initializing wait queue
-		 */
-		init_waitqueue_head(&port->gs.port.open_wait);
-		init_waitqueue_head(&port->gs.port.close_wait);
 	}
 #else
 	/* We could postpone initializing them to when they are configured. */
diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index 2c6c8f33d6b..6ad1c2aa2a9 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -857,23 +857,40 @@ static void rc_shutdown_port(struct tty_struct *tty,
 		rc_shutdown_board(bp);
 }
 
+static int carrier_raised(struct tty_port *port)
+{
+	struct riscom_port *p = container_of(port, struct riscom_port, port);
+	struct riscom_board *bp = port_Board(p);
+	unsigned long flags;
+	int CD;
+	
+	spin_lock_irqsave(&riscom_lock, flags);
+	rc_out(bp, CD180_CAR, port_No(p));
+	CD = rc_in(bp, CD180_MSVR) & MSVR_CD;
+	rc_out(bp, CD180_MSVR, MSVR_RTS);
+	bp->DTR &= ~(1u << port_No(p));
+	rc_out(bp, RC_DTR, bp->DTR);
+	spin_unlock_irqrestore(&riscom_lock, flags);
+	return CD;
+}
+
 static int block_til_ready(struct tty_struct *tty, struct file *filp,
-			   struct riscom_port *port)
+			   struct riscom_port *rp)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	struct riscom_board *bp = port_Board(port);
 	int    retval;
 	int    do_clocal = 0;
 	int    CD;
 	unsigned long flags;
+	struct tty_port *port = &rp->port;
 
 	/*
 	 * If the device is in the middle of being closed, then block
 	 * until it's done, and then try again.
 	 */
-	if (tty_hung_up_p(filp) || port->port.flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&port->port.close_wait);
-		if (port->port.flags & ASYNC_HUP_NOTIFY)
+	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
+		interruptible_sleep_on(&port->close_wait);
+		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
 		else
 			return -ERESTARTSYS;
@@ -885,7 +902,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	 */
 	if ((filp->f_flags & O_NONBLOCK) ||
 	    (tty->flags & (1 << TTY_IO_ERROR))) {
-		port->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -900,37 +917,29 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	 * exit, either normal or abnormal.
 	 */
 	retval = 0;
-	add_wait_queue(&port->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 
 	spin_lock_irqsave(&riscom_lock, flags);
 
 	if (!tty_hung_up_p(filp))
-		port->port.count--;
+		port->count--;
 
 	spin_unlock_irqrestore(&riscom_lock, flags);
 
-	port->port.blocked_open++;
+	port->blocked_open++;
 	while (1) {
-		spin_lock_irqsave(&riscom_lock, flags);
-
-		rc_out(bp, CD180_CAR, port_No(port));
-		CD = rc_in(bp, CD180_MSVR) & MSVR_CD;
-		rc_out(bp, CD180_MSVR, MSVR_RTS);
-		bp->DTR &= ~(1u << port_No(port));
-		rc_out(bp, RC_DTR, bp->DTR);
-
-		spin_unlock_irqrestore(&riscom_lock, flags);
 
+		CD = tty_port_carrier_raised(port);
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) ||
-		    !(port->port.flags & ASYNC_INITIALIZED)) {
-			if (port->port.flags & ASYNC_HUP_NOTIFY)
+		    !(port->flags & ASYNC_INITIALIZED)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				retval = -EAGAIN;
 			else
 				retval = -ERESTARTSYS;
 			break;
 		}
-		if (!(port->port.flags & ASYNC_CLOSING) &&
+		if (!(port->flags & ASYNC_CLOSING) &&
 		    (do_clocal || CD))
 			break;
 		if (signal_pending(current)) {
@@ -940,14 +949,14 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 	if (!tty_hung_up_p(filp))
-		port->port.count++;
-	port->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 	if (retval)
 		return retval;
 
-	port->port.flags |= ASYNC_NORMAL_ACTIVE;
+	port->flags |= ASYNC_NORMAL_ACTIVE;
 	return 0;
 }
 
@@ -1510,6 +1519,11 @@ static const struct tty_operations riscom_ops = {
 	.break_ctl = rc_send_break,
 };
 
+static const struct tty_port_operations riscom_port_ops = {
+	.carrier_raised = carrier_raised,
+};
+
+
 static int __init rc_init_drivers(void)
 {
 	int error;
@@ -1541,6 +1555,7 @@ static int __init rc_init_drivers(void)
 	memset(rc_port, 0, sizeof(rc_port));
 	for (i = 0; i < RC_NPORT * RC_NBOARD; i++)  {
 		tty_port_init(&rc_port[i].port);
+		rc_port[i].port.ops = &riscom_port_ops;
 		rc_port[i].magic = RISCOM8_MAGIC;
 	}
 	return 0;
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 584d791e84a..4a4110e703a 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -135,6 +135,7 @@ static int rcktpt_type[NUM_BOARDS];
 static int is_PCI[NUM_BOARDS];
 static rocketModel_t rocketModel[NUM_BOARDS];
 static int max_board;
+static const struct tty_port_operations rocket_port_ops;
 
 /*
  * The following arrays define the interrupt bits corresponding to each AIOP.
@@ -649,9 +650,8 @@ static void init_r_port(int board, int aiop, int chan, struct pci_dev *pci_dev)
 	info->board = board;
 	info->aiop = aiop;
 	info->chan = chan;
-	info->port.closing_wait = 3000;
-	info->port.close_delay = 50;
-	init_waitqueue_head(&info->port.open_wait);
+	tty_port_init(&info->port);
+	info->port.ops = &rocket_port_ops;
 	init_completion(&info->close_wait);
 	info->flags &= ~ROCKET_MODE_MASK;
 	switch (pc104[board][line]) {
@@ -864,11 +864,18 @@ static void configure_r_port(struct r_port *info,
 	}
 }
 
+static int carrier_raised(struct tty_port *port)
+{
+	struct r_port *info = container_of(port, struct r_port, port);
+	return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0;
+}
+
 /*  info->port.count is considered critical, protected by spinlocks.  */
 static int block_til_ready(struct tty_struct *tty, struct file *filp,
 			   struct r_port *info)
 {
 	DECLARE_WAITQUEUE(wait, current);
+	struct tty_port *port = &info->port;
 	int retval;
 	int do_clocal = 0, extra_count = 0;
 	unsigned long flags;
@@ -898,13 +905,13 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 	/*
 	 * Block waiting for the carrier detect and the line to become free.  While we are in
-	 * this loop, info->port.count is dropped by one, so that rp_close() knows when to free things.
+	 * this loop, port->count is dropped by one, so that rp_close() knows when to free things.
          * We restore it upon exit, either normal or abnormal.
 	 */
 	retval = 0;
-	add_wait_queue(&info->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 #ifdef ROCKET_DEBUG_OPEN
-	printk(KERN_INFO "block_til_ready before block: ttyR%d, count = %d\n", info->line, info->port.count);
+	printk(KERN_INFO "block_til_ready before block: ttyR%d, count = %d\n", info->line, port->count);
 #endif
 	spin_lock_irqsave(&info->slock, flags);
 
@@ -913,10 +920,10 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 #else
 	if (!tty_hung_up_p(filp)) {
 		extra_count = 1;
-		info->port.count--;
+		port->count--;
 	}
 #endif
-	info->port.blocked_open++;
+	port->blocked_open++;
 
 	spin_unlock_irqrestore(&info->slock, flags);
 
@@ -933,7 +940,8 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 				retval = -ERESTARTSYS;
 			break;
 		}
-		if (!(info->flags & ROCKET_CLOSING) && (do_clocal || (sGetChanStatusLo(&info->channel) & CD_ACT)))
+		if (!(info->flags & ROCKET_CLOSING) &&
+			(do_clocal || tty_port_carrier_raised(port)))
 			break;
 		if (signal_pending(current)) {
 			retval = -ERESTARTSYS;
@@ -941,24 +949,24 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 		}
 #ifdef ROCKET_DEBUG_OPEN
 		printk(KERN_INFO "block_til_ready blocking: ttyR%d, count = %d, flags=0x%0x\n",
-		     info->line, info->port.count, info->flags);
+		     info->line, port->count, info->flags);
 #endif
 		schedule();	/*  Don't hold spinlock here, will hang PC */
 	}
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&info->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 
 	spin_lock_irqsave(&info->slock, flags);
 
 	if (extra_count)
-		info->port.count++;
-	info->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 
 	spin_unlock_irqrestore(&info->slock, flags);
 
 #ifdef ROCKET_DEBUG_OPEN
 	printk(KERN_INFO "block_til_ready after blocking: ttyR%d, count = %d\n",
-	       info->line, info->port.count);
+	       info->line, port->count);
 #endif
 	if (retval)
 		return retval;
@@ -2371,6 +2379,10 @@ static const struct tty_operations rocket_ops = {
 	.tiocmset = rp_tiocmset,
 };
 
+static const struct tty_port_operations rocket_port_ops = {
+	.carrier_raised = carrier_raised,
+};
+
 /*
  * The module "startup" routine; it's run when the module is loaded.
  */
diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c
index 7b0c35207d9..0c97f34df63 100644
--- a/drivers/char/ser_a2232.c
+++ b/drivers/char/ser_a2232.c
@@ -122,7 +122,7 @@ static void a2232_disable_tx_interrupts(void *ptr);
 static void a2232_enable_tx_interrupts(void *ptr);
 static void a2232_disable_rx_interrupts(void *ptr);
 static void a2232_enable_rx_interrupts(void *ptr);
-static int  a2232_get_CD(void *ptr);
+static int  a2232_carrier_raised(struct tty_port *port);
 static void a2232_shutdown_port(void *ptr);
 static int  a2232_set_real_termios(void *ptr);
 static int  a2232_chars_in_buffer(void *ptr);
@@ -148,7 +148,6 @@ static struct real_driver a2232_real_driver = {
         a2232_enable_tx_interrupts,
         a2232_disable_rx_interrupts,
         a2232_enable_rx_interrupts,
-        a2232_get_CD,
         a2232_shutdown_port,
         a2232_set_real_termios,
         a2232_chars_in_buffer,
@@ -260,9 +259,10 @@ static void a2232_enable_rx_interrupts(void *ptr)
 	port->disable_rx = 0;
 }
 
-static int  a2232_get_CD(void *ptr)
+static int  a2232_carrier_raised(struct tty_port *port)
 {
-	return ((struct a2232_port *) ptr)->cd_status;
+	struct a2232_port *ap = container_of(port, struct a2232_port, gs.port);
+	return ap->cd_status;
 }
 
 static void a2232_shutdown_port(void *ptr)
@@ -638,6 +638,10 @@ int ch, err, n, p;
 	return IRQ_HANDLED;
 }
 
+static const struct tty_port_operations a2232_port_ops = {
+	.carrier_raised = a2232_carrier_raised,
+};
+
 static void a2232_init_portstructs(void)
 {
 	struct a2232_port *port;
@@ -645,6 +649,8 @@ static void a2232_init_portstructs(void)
 
 	for (i = 0; i < MAX_A2232_BOARDS*NUMLINES; i++) {
 		port = a2232_ports + i;
+		tty_port_init(&port->gs.port);
+		port->gs.port.ops = &a2232_port_ops;
 		port->which_a2232 = i/NUMLINES;
 		port->which_port_on_a2232 = i%NUMLINES;
 		port->disable_rx = port->throttle_input = port->cd_status = 0;
@@ -652,11 +658,6 @@ static void a2232_init_portstructs(void)
 		port->gs.close_delay = HZ/2;
 		port->gs.closing_wait = 30 * HZ;
 		port->gs.rd = &a2232_real_driver;
-#ifdef NEW_WRITE_LOCKING
-		mutex_init(&(port->gs.port_write_mutex));
-#endif
-		init_waitqueue_head(&port->gs.port.open_wait);
-		init_waitqueue_head(&port->gs.port.close_wait);
 	}
 }
 
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index 963b03fb29e..12aecdaf61e 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -130,6 +130,8 @@ static char		stl_unwanted[SC26198_RXFIFOSIZE];
 static DEFINE_MUTEX(stl_brdslock);
 static struct stlbrd		*stl_brds[STL_MAXBRDS];
 
+static const struct tty_port_operations stl_port_ops;
+
 /*
  *	Per board state flags. Used with the state field of the board struct.
  *	Not really much here!
@@ -786,6 +788,12 @@ static int stl_open(struct tty_struct *tty, struct file *filp)
 
 /*****************************************************************************/
 
+static int stl_carrier_raised(struct tty_port *port)
+{
+	struct stlport *portp = container_of(port, struct stlport, port);
+	return (portp->sigs & TIOCM_CD) ? 1 : 0;
+}
+
 /*
  *	Possibly need to wait for carrier (DCD signal) to come high. Say
  *	maybe because if we are clocal then we don't need to wait...
@@ -796,6 +804,7 @@ static int stl_waitcarrier(struct tty_struct *tty, struct stlport *portp,
 {
 	unsigned long	flags;
 	int		rc, doclocal;
+	struct tty_port *port = &portp->port;
 
 	pr_debug("stl_waitcarrier(portp=%p,filp=%p)\n", portp, filp);
 
@@ -809,32 +818,32 @@ static int stl_waitcarrier(struct tty_struct *tty, struct stlport *portp,
 
 	portp->openwaitcnt++;
 	if (! tty_hung_up_p(filp))
-		portp->port.count--;
+		port->count--;
 
 	for (;;) {
 		/* Takes brd_lock internally */
 		stl_setsignals(portp, 1, 1);
 		if (tty_hung_up_p(filp) ||
-		    ((portp->port.flags & ASYNC_INITIALIZED) == 0)) {
-			if (portp->port.flags & ASYNC_HUP_NOTIFY)
+		    ((port->flags & ASYNC_INITIALIZED) == 0)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
 				rc = -EBUSY;
 			else
 				rc = -ERESTARTSYS;
 			break;
 		}
-		if (((portp->port.flags & ASYNC_CLOSING) == 0) &&
-		    (doclocal || (portp->sigs & TIOCM_CD)))
+		if (((port->flags & ASYNC_CLOSING) == 0) &&
+		    (doclocal || tty_port_carrier_raised(port)))
 			break;
 		if (signal_pending(current)) {
 			rc = -ERESTARTSYS;
 			break;
 		}
 		/* FIXME */
-		interruptible_sleep_on(&portp->port.open_wait);
+		interruptible_sleep_on(&port->open_wait);
 	}
 
 	if (! tty_hung_up_p(filp))
-		portp->port.count++;
+		port->count++;
 	portp->openwaitcnt--;
 	spin_unlock_irqrestore(&stallion_lock, flags);
 
@@ -1776,6 +1785,7 @@ static int __devinit stl_initports(struct stlbrd *brdp, struct stlpanel *panelp)
 			break;
 		}
 		tty_port_init(&portp->port);
+		portp->port.ops = &stl_port_ops;
 		portp->magic = STL_PORTMAGIC;
 		portp->portnr = i;
 		portp->brdnr = panelp->brdnr;
@@ -2659,6 +2669,10 @@ static const struct tty_operations stl_ops = {
 	.tiocmset = stl_tiocmset,
 };
 
+static const struct tty_port_operations stl_port_ops = {
+	.carrier_raised = stl_carrier_raised,
+};
+
 /*****************************************************************************/
 /*                       CD1400 HARDWARE FUNCTIONS                           */
 /*****************************************************************************/
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index ba4e86281fb..a71bc58abe7 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -279,7 +279,7 @@ static void sx_disable_tx_interrupts(void *ptr);
 static void sx_enable_tx_interrupts(void *ptr);
 static void sx_disable_rx_interrupts(void *ptr);
 static void sx_enable_rx_interrupts(void *ptr);
-static int sx_get_CD(void *ptr);
+static int sx_carrier_raised(struct tty_port *port);
 static void sx_shutdown_port(void *ptr);
 static int sx_set_real_termios(void *ptr);
 static void sx_close(void *ptr);
@@ -360,7 +360,6 @@ static struct real_driver sx_real_driver = {
 	sx_enable_tx_interrupts,
 	sx_disable_rx_interrupts,
 	sx_enable_rx_interrupts,
-	sx_get_CD,
 	sx_shutdown_port,
 	sx_set_real_termios,
 	sx_chars_in_buffer,
@@ -791,7 +790,7 @@ static int sx_getsignals(struct sx_port *port)
 	sx_dprintk(SX_DEBUG_MODEMSIGNALS, "getsignals: %d/%d  (%d/%d) "
 			"%02x/%02x\n",
 			(o_stat & OP_DTR) != 0, (o_stat & OP_RTS) != 0,
-			port->c_dcd, sx_get_CD(port),
+			port->c_dcd, tty_port_carrier_raised(&port->gs.port),
 			sx_read_channel_byte(port, hi_ip),
 			sx_read_channel_byte(port, hi_state));
 
@@ -1190,7 +1189,7 @@ static inline void sx_check_modem_signals(struct sx_port *port)
 
 	hi_state = sx_read_channel_byte(port, hi_state);
 	sx_dprintk(SX_DEBUG_MODEMSIGNALS, "Checking modem signals (%d/%d)\n",
-			port->c_dcd, sx_get_CD(port));
+			port->c_dcd, tty_port_carrier_raised(&port->gs.port));
 
 	if (hi_state & ST_BREAK) {
 		hi_state &= ~ST_BREAK;
@@ -1202,11 +1201,11 @@ static inline void sx_check_modem_signals(struct sx_port *port)
 		hi_state &= ~ST_DCD;
 		sx_dprintk(SX_DEBUG_MODEMSIGNALS, "got a DCD change.\n");
 		sx_write_channel_byte(port, hi_state, hi_state);
-		c_dcd = sx_get_CD(port);
+		c_dcd = tty_port_carrier_raised(&port->gs.port);
 		sx_dprintk(SX_DEBUG_MODEMSIGNALS, "DCD is now %d\n", c_dcd);
 		if (c_dcd != port->c_dcd) {
 			port->c_dcd = c_dcd;
-			if (sx_get_CD(port)) {
+			if (tty_port_carrier_raised(&port->gs.port)) {
 				/* DCD went UP */
 				if ((sx_read_channel_byte(port, hi_hstat) !=
 						HS_IDLE_CLOSED) &&
@@ -1415,13 +1414,10 @@ static void sx_enable_rx_interrupts(void *ptr)
 }
 
 /* Jeez. Isn't this simple? */
-static int sx_get_CD(void *ptr)
+static int sx_carrier_raised(struct tty_port *port)
 {
-	struct sx_port *port = ptr;
-	func_enter2();
-
-	func_exit();
-	return ((sx_read_channel_byte(port, hi_ip) & IP_DCD) != 0);
+	struct sx_port *sp = container_of(port, struct sx_port, gs.port);
+	return ((sx_read_channel_byte(sp, hi_ip) & IP_DCD) != 0);
 }
 
 /* Jeez. Isn't this simple? */
@@ -1536,7 +1532,7 @@ static int sx_open(struct tty_struct *tty, struct file *filp)
 	}
 	/* tty->low_latency = 1; */
 
-	port->c_dcd = sx_get_CD(port);
+	port->c_dcd = sx_carrier_raised(&port->gs.port);
 	sx_dprintk(SX_DEBUG_OPEN, "at open: cd=%d\n", port->c_dcd);
 
 	func_exit();
@@ -2354,6 +2350,10 @@ static const struct tty_operations sx_ops = {
 	.tiocmset = sx_tiocmset,
 };
 
+static const struct tty_port_operations sx_port_ops = {
+	.carrier_raised = sx_carrier_raised,
+};
+
 static int sx_init_drivers(void)
 {
 	int error;
@@ -2410,6 +2410,7 @@ static int sx_init_portstructs(int nboards, int nports)
 		for (j = 0; j < boards[i].nports; j++) {
 			sx_dprintk(SX_DEBUG_INIT, "initing port %d\n", j);
 			tty_port_init(&port->gs.port);
+			port->gs.port.ops = &sx_port_ops;
 			port->gs.magic = SX_MAGIC;
 			port->gs.close_delay = HZ / 2;
 			port->gs.closing_wait = 30 * HZ;
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index 500f5176b6b..fb2e6b5e0ef 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -3281,6 +3281,23 @@ static void mgsl_hangup(struct tty_struct *tty)
 	
 }	/* end of mgsl_hangup() */
 
+/*
+ * carrier_raised()
+ *
+ *	Return true if carrier is raised
+ */
+
+static int carrier_raised(struct tty_port *port)
+{
+	unsigned long flags;
+	struct mgsl_struct *info = container_of(port, struct mgsl_struct, port);
+	
+	spin_lock_irqsave(&info->irq_spinlock, flags);
+ 	usc_get_serial_signals(info);
+	spin_unlock_irqrestore(&info->irq_spinlock, flags);
+	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
+}
+
 /* block_til_ready()
  * 
  * 	Block the current process until the specified port
@@ -3302,6 +3319,8 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 	bool		do_clocal = false;
 	bool		extra_count = false;
 	unsigned long	flags;
+	int		dcd;
+	struct tty_port *port = &info->port;
 	
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):block_til_ready on %s\n",
@@ -3309,7 +3328,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 
 	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
 		/* nonblock mode is set or port is not enabled */
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -3318,25 +3337,25 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 
 	/* Wait for carrier detect and the line to become
 	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, info->port.count is dropped by one, so that
+	 * this loop, port->count is dropped by one, so that
 	 * mgsl_close() knows when to free things.  We restore it upon
 	 * exit, either normal or abnormal.
 	 */
 	 
 	retval = 0;
-	add_wait_queue(&info->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 	
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):block_til_ready before block on %s count=%d\n",
-			 __FILE__,__LINE__, tty->driver->name, info->port.count );
+			 __FILE__,__LINE__, tty->driver->name, port->count );
 
 	spin_lock_irqsave(&info->irq_spinlock, flags);
 	if (!tty_hung_up_p(filp)) {
 		extra_count = true;
-		info->port.count--;
+		port->count--;
 	}
 	spin_unlock_irqrestore(&info->irq_spinlock, flags);
-	info->port.blocked_open++;
+	port->blocked_open++;
 	
 	while (1) {
 		if (tty->termios->c_cflag & CBAUD) {
@@ -3348,20 +3367,16 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 		
 		set_current_state(TASK_INTERRUPTIBLE);
 		
-		if (tty_hung_up_p(filp) || !(info->port.flags & ASYNC_INITIALIZED)){
-			retval = (info->port.flags & ASYNC_HUP_NOTIFY) ?
+		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){
+			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
 		}
 		
-		spin_lock_irqsave(&info->irq_spinlock,flags);
-	 	usc_get_serial_signals(info);
-		spin_unlock_irqrestore(&info->irq_spinlock,flags);
+		dcd = tty_port_carrier_raised(&info->port);
 		
- 		if (!(info->port.flags & ASYNC_CLOSING) &&
- 		    (do_clocal || (info->serial_signals & SerialSignal_DCD)) ) {
+ 		if (!(port->flags & ASYNC_CLOSING) && (do_clocal || dcd))
  			break;
-		}
 			
 		if (signal_pending(current)) {
 			retval = -ERESTARTSYS;
@@ -3370,24 +3385,24 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 		
 		if (debug_level >= DEBUG_LEVEL_INFO)
 			printk("%s(%d):block_til_ready blocking on %s count=%d\n",
-				 __FILE__,__LINE__, tty->driver->name, info->port.count );
+				 __FILE__,__LINE__, tty->driver->name, port->count );
 				 
 		schedule();
 	}
 	
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&info->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 	
 	if (extra_count)
-		info->port.count++;
-	info->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 	
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):block_til_ready after blocking on %s count=%d\n",
-			 __FILE__,__LINE__, tty->driver->name, info->port.count );
+			 __FILE__,__LINE__, tty->driver->name, port->count );
 			 
 	if (!retval)
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		
 	return retval;
 	
@@ -4304,6 +4319,11 @@ static void mgsl_add_device( struct mgsl_struct *info )
 
 }	/* end of mgsl_add_device() */
 
+static const struct tty_port_operations mgsl_port_ops = {
+	.carrier_raised = carrier_raised,
+};
+
+
 /* mgsl_allocate_device()
  * 
  * 	Allocate and initialize a device instance structure
@@ -4322,6 +4342,7 @@ static struct mgsl_struct* mgsl_allocate_device(void)
 		printk("Error can't allocate device instance data\n");
 	} else {
 		tty_port_init(&info->port);
+		info->port.ops = &mgsl_port_ops;
 		info->magic = MGSL_MAGIC;
 		INIT_WORK(&info->task, mgsl_bh_handler);
 		info->max_frame_size = 4096;
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 08911ed6649..39ccaba8ca3 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -3132,6 +3132,17 @@ static int tiocmset(struct tty_struct *tty, struct file *file,
 	return 0;
 }
 
+static int carrier_raised(struct tty_port *port)
+{
+	unsigned long flags;
+	struct slgt_info *info = container_of(port, struct slgt_info, port);
+
+	spin_lock_irqsave(&info->lock,flags);
+ 	get_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+	return (info->signals & SerialSignal_DCD) ? 1 : 0;
+}
+
 /*
  *  block current process until the device is ready to open
  */
@@ -3143,12 +3154,14 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	bool		do_clocal = false;
 	bool		extra_count = false;
 	unsigned long	flags;
+	int		cd;
+	struct tty_port *port = &info->port;
 
 	DBGINFO(("%s block_til_ready\n", tty->driver->name));
 
 	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
 		/* nonblock mode is set or port is not enabled */
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -3157,21 +3170,21 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 	/* Wait for carrier detect and the line to become
 	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, info->port.count is dropped by one, so that
+	 * this loop, port->count is dropped by one, so that
 	 * close() knows when to free things.  We restore it upon
 	 * exit, either normal or abnormal.
 	 */
 
 	retval = 0;
-	add_wait_queue(&info->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 
 	spin_lock_irqsave(&info->lock, flags);
 	if (!tty_hung_up_p(filp)) {
 		extra_count = true;
-		info->port.count--;
+		port->count--;
 	}
 	spin_unlock_irqrestore(&info->lock, flags);
-	info->port.blocked_open++;
+	port->blocked_open++;
 
 	while (1) {
 		if ((tty->termios->c_cflag & CBAUD)) {
@@ -3183,20 +3196,16 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (tty_hung_up_p(filp) || !(info->port.flags & ASYNC_INITIALIZED)){
-			retval = (info->port.flags & ASYNC_HUP_NOTIFY) ?
+		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){
+			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
 		}
 
-		spin_lock_irqsave(&info->lock,flags);
-	 	get_signals(info);
-		spin_unlock_irqrestore(&info->lock,flags);
+		cd = tty_port_carrier_raised(port);
 
- 		if (!(info->port.flags & ASYNC_CLOSING) &&
- 		    (do_clocal || (info->signals & SerialSignal_DCD)) ) {
+ 		if (!(port->flags & ASYNC_CLOSING) && (do_clocal || cd ))
  			break;
-		}
 
 		if (signal_pending(current)) {
 			retval = -ERESTARTSYS;
@@ -3208,14 +3217,14 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	}
 
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&info->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 
 	if (extra_count)
-		info->port.count++;
-	info->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 
 	if (!retval)
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 
 	DBGINFO(("%s block_til_ready ready, rc=%d\n", tty->driver->name, retval));
 	return retval;
@@ -3444,6 +3453,10 @@ static void add_device(struct slgt_info *info)
 #endif
 }
 
+static const struct tty_port_operations slgt_port_ops = {
+	.carrier_raised = carrier_raised,
+};
+
 /*
  *  allocate device instance structure, return NULL on failure
  */
@@ -3458,6 +3471,7 @@ static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev
 			driver_name, adapter_num, port_num));
 	} else {
 		tty_port_init(&info->port);
+		info->port.ops = &slgt_port_ops;
 		info->magic = MGSL_MAGIC;
 		INIT_WORK(&info->task, bh_handler);
 		info->max_frame_size = 4096;
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index 6bdb44f7bec..fcf1ec77450 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -558,6 +558,7 @@ static void release_resources(SLMP_INFO *info);
 
 static int  startup(SLMP_INFO *info);
 static int  block_til_ready(struct tty_struct *tty, struct file * filp,SLMP_INFO *info);
+static int carrier_raised(struct tty_port *port);
 static void shutdown(SLMP_INFO *info);
 static void program_hw(SLMP_INFO *info);
 static void change_params(SLMP_INFO *info);
@@ -3318,7 +3319,17 @@ static int tiocmset(struct tty_struct *tty, struct file *file,
 	return 0;
 }
 
+static int carrier_raised(struct tty_port *port)
+{
+	SLMP_INFO *info = container_of(port, SLMP_INFO, port);
+	unsigned long flags;
 
+	spin_lock_irqsave(&info->lock,flags);
+ 	get_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+
+	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
+}
 
 /* Block the current process until the specified port is ready to open.
  */
@@ -3330,6 +3341,8 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	bool		do_clocal = false;
 	bool		extra_count = false;
 	unsigned long	flags;
+	int		cd;
+	struct tty_port *port = &info->port;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):%s block_til_ready()\n",
@@ -3338,7 +3351,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){
 		/* nonblock mode is set or port is not enabled */
 		/* just verify that callout device is not active */
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 		return 0;
 	}
 
@@ -3347,25 +3360,25 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 	/* Wait for carrier detect and the line to become
 	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, info->port.count is dropped by one, so that
+	 * this loop, port->count is dropped by one, so that
 	 * close() knows when to free things.  We restore it upon
 	 * exit, either normal or abnormal.
 	 */
 
 	retval = 0;
-	add_wait_queue(&info->port.open_wait, &wait);
+	add_wait_queue(&port->open_wait, &wait);
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):%s block_til_ready() before block, count=%d\n",
-			 __FILE__,__LINE__, tty->driver->name, info->port.count );
+			 __FILE__,__LINE__, tty->driver->name, port->count );
 
 	spin_lock_irqsave(&info->lock, flags);
 	if (!tty_hung_up_p(filp)) {
 		extra_count = true;
-		info->port.count--;
+		port->count--;
 	}
 	spin_unlock_irqrestore(&info->lock, flags);
-	info->port.blocked_open++;
+	port->blocked_open++;
 
 	while (1) {
 		if ((tty->termios->c_cflag & CBAUD)) {
@@ -3377,20 +3390,16 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (tty_hung_up_p(filp) || !(info->port.flags & ASYNC_INITIALIZED)){
-			retval = (info->port.flags & ASYNC_HUP_NOTIFY) ?
+		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)){
+			retval = (port->flags & ASYNC_HUP_NOTIFY) ?
 					-EAGAIN : -ERESTARTSYS;
 			break;
 		}
 
-		spin_lock_irqsave(&info->lock,flags);
-	 	get_signals(info);
-		spin_unlock_irqrestore(&info->lock,flags);
+		cd = tty_port_carrier_raised(port);
 
- 		if (!(info->port.flags & ASYNC_CLOSING) &&
- 		    (do_clocal || (info->serial_signals & SerialSignal_DCD)) ) {
+ 		if (!(port->flags & ASYNC_CLOSING) && (do_clocal || cd))
  			break;
-		}
 
 		if (signal_pending(current)) {
 			retval = -ERESTARTSYS;
@@ -3399,24 +3408,24 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 
 		if (debug_level >= DEBUG_LEVEL_INFO)
 			printk("%s(%d):%s block_til_ready() count=%d\n",
-				 __FILE__,__LINE__, tty->driver->name, info->port.count );
+				 __FILE__,__LINE__, tty->driver->name, port->count );
 
 		schedule();
 	}
 
 	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&info->port.open_wait, &wait);
+	remove_wait_queue(&port->open_wait, &wait);
 
 	if (extra_count)
-		info->port.count++;
-	info->port.blocked_open--;
+		port->count++;
+	port->blocked_open--;
 
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):%s block_til_ready() after, count=%d\n",
-			 __FILE__,__LINE__, tty->driver->name, info->port.count );
+			 __FILE__,__LINE__, tty->driver->name, port->count );
 
 	if (!retval)
-		info->port.flags |= ASYNC_NORMAL_ACTIVE;
+		port->flags |= ASYNC_NORMAL_ACTIVE;
 
 	return retval;
 }
@@ -3782,6 +3791,10 @@ static void add_device(SLMP_INFO *info)
 #endif
 }
 
+static const struct tty_port_operations port_ops = {
+	.carrier_raised = carrier_raised,
+};
+
 /* Allocate and initialize a device instance structure
  *
  * Return Value:	pointer to SLMP_INFO if success, otherwise NULL
@@ -3798,6 +3811,7 @@ static SLMP_INFO *alloc_dev(int adapter_num, int port_num, struct pci_dev *pdev)
 			__FILE__,__LINE__, adapter_num, port_num);
 	} else {
 		tty_port_init(&info->port);
+		info->port.ops = &port_ops;
 		info->magic = MGSL_MAGIC;
 		INIT_WORK(&info->task, bh_handler);
 		info->max_frame_size = 4096;
@@ -3940,6 +3954,7 @@ static const struct tty_operations ops = {
 	.tiocmset = tiocmset,
 };
 
+
 static void synclinkmp_cleanup(void)
 {
 	int rc;
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index c8f8024cb40..f54e40cbf02 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -94,3 +94,20 @@ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty)
 	spin_unlock_irqrestore(&port->lock, flags);
 }
 EXPORT_SYMBOL(tty_port_tty_set);
+
+/**
+ *	tty_port_carrier_raised	-	carrier raised check
+ *	@port: tty port
+ *
+ *	Wrapper for the carrier detect logic. For the moment this is used
+ *	to hide some internal details. This will eventually become entirely
+ *	internal to the tty port.
+ */
+
+int tty_port_carrier_raised(struct tty_port *port)
+{
+	if (port->ops->carrier_raised == NULL)
+		return 1;
+	return port->ops->carrier_raised(port);
+}
+EXPORT_SYMBOL(tty_port_carrier_raised);
diff --git a/drivers/char/vme_scc.c b/drivers/char/vme_scc.c
index 1718b3c481d..d4e1534c0e0 100644
--- a/drivers/char/vme_scc.c
+++ b/drivers/char/vme_scc.c
@@ -69,7 +69,7 @@ static void scc_disable_tx_interrupts(void * ptr);
 static void scc_enable_tx_interrupts(void * ptr);
 static void scc_disable_rx_interrupts(void * ptr);
 static void scc_enable_rx_interrupts(void * ptr);
-static int  scc_get_CD(void * ptr);
+static int  scc_carrier_raised(struct tty_port *port);
 static void scc_shutdown_port(void * ptr);
 static int scc_set_real_termios(void  *ptr);
 static void scc_hungup(void  *ptr);
@@ -100,7 +100,6 @@ static struct real_driver scc_real_driver = {
         scc_enable_tx_interrupts,
         scc_disable_rx_interrupts,
         scc_enable_rx_interrupts,
-        scc_get_CD,
         scc_shutdown_port,
         scc_set_real_termios,
         scc_chars_in_buffer,
@@ -129,6 +128,10 @@ static const struct tty_operations scc_ops = {
 	.break_ctl = scc_break_ctl,
 };
 
+static const struct tty_port_operations scc_port_ops = {
+	.carrier_raised = scc_carrier_raised,
+};
+
 /*----------------------------------------------------------------------------
  * vme_scc_init() and support functions
  *---------------------------------------------------------------------------*/
@@ -176,6 +179,8 @@ static void scc_init_portstructs(void)
 
 	for (i = 0; i < 2; i++) {
 		port = scc_ports + i;
+		tty_port_init(&port->gs.port);
+		port->gs.port.ops = &scc_port_ops;
 		port->gs.magic = SCC_MAGIC;
 		port->gs.close_delay = HZ/2;
 		port->gs.closing_wait = 30 * HZ;
@@ -624,9 +629,9 @@ static void scc_enable_rx_interrupts(void *ptr)
 }
 
 
-static int scc_get_CD(void *ptr)
+static int scc_carrier_raised(struct tty_port *port)
 {
-	struct scc_port *port = ptr;
+	struct scc_port *scc = container_of(port, struct scc_port, gs.port);
 	unsigned channel = port->channel;
 
 	return !!(scc_last_status_reg[channel] & SR_DCD);
@@ -896,7 +901,7 @@ static int scc_open (struct tty_struct * tty, struct file * filp)
 		return retval;
 	}
 
-	port->c_dcd = scc_get_CD (port);
+	port->c_dcd = tty_port_carrier_raised(&port->gs.port);
 
 	scc_enable_rx_interrupts(port);
 
diff --git a/include/linux/generic_serial.h b/include/linux/generic_serial.h
index 4cc91393981..fadff28505b 100644
--- a/include/linux/generic_serial.h
+++ b/include/linux/generic_serial.h
@@ -21,7 +21,6 @@ struct real_driver {
   void                    (*enable_tx_interrupts) (void *);
   void                    (*disable_rx_interrupts) (void *);
   void                    (*enable_rx_interrupts) (void *);
-  int                     (*get_CD) (void *);
   void                    (*shutdown_port) (void*);
   int                     (*set_real_termios) (void*);
   int                     (*chars_in_buffer) (void*);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index bbbeaef9962..bc7bae78e22 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -180,8 +180,16 @@ struct signal_struct;
  * until a hangup so don't use the wrong path.
  */
 
+struct tty_port;
+
+struct tty_port_operations {
+	/* Return 1 if the carrier is raised */
+	int (*carrier_raised)(struct tty_port *port);
+};
+	
 struct tty_port {
 	struct tty_struct	*tty;		/* Back pointer */
+	const struct tty_port_operations *ops;	/* Port operations */
 	spinlock_t		lock;		/* Lock protecting tty field */
 	int			blocked_open;	/* Waiting to open */
 	int			count;		/* Usage count */
@@ -427,6 +435,7 @@ extern int tty_port_alloc_xmit_buf(struct tty_port *port);
 extern void tty_port_free_xmit_buf(struct tty_port *port);
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
+extern int tty_port_carrier_raised(struct tty_port *port);
 
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
-- 
cgit v1.2.3-70-g09d2


From 5d951fb458f847e5485b5251597fbf326000bb3b Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:45:19 +0000
Subject: tty: Pull the dtr raise into tty port

This moves another per device special out of what should be shared open
wait paths into private methods

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/isicom.c      | 13 ++++++++-----
 drivers/char/mxser.c       | 17 +++++++++++++----
 drivers/char/rocket.c      | 14 ++++++++++----
 drivers/char/synclink.c    | 21 +++++++++++++++------
 drivers/char/synclink_gt.c | 21 +++++++++++++++------
 drivers/char/tty_port.c    | 16 ++++++++++++++++
 include/linux/tty.h        |  2 ++
 7 files changed, 79 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index b3da4858fd4..a449449e301 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -328,11 +328,13 @@ static inline void drop_rts(struct isi_port *port)
 }
 
 /* card->lock MUST NOT be held */
-static inline void raise_dtr_rts(struct isi_port *port)
+
+static void isicom_raise_dtr_rts(struct tty_port *port)
 {
-	struct isi_board *card = port->card;
+	struct isi_port *ip = container_of(port, struct isi_port, port);
+	struct isi_board *card = ip->card;
 	unsigned long base = card->base;
-	u16 channel = port->channel;
+	u16 channel = ip->channel;
 
 	if (!lock_card(card))
 		return;
@@ -340,7 +342,7 @@ static inline void raise_dtr_rts(struct isi_port *port)
 	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
 	outw(0x0f04, base);
 	InterruptTheCard(base);
-	port->status |= (ISI_DTR | ISI_RTS);
+	ip->status |= (ISI_DTR | ISI_RTS);
 	unlock_card(card);
 }
 
@@ -881,7 +883,7 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	spin_unlock_irqrestore(&card->card_lock, flags);
 
 	while (1) {
-		raise_dtr_rts(ip);
+		tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
@@ -1462,6 +1464,7 @@ static const struct tty_operations isicom_ops = {
 
 static const struct tty_port_operations isicom_port_ops = {
 	.carrier_raised		= isicom_carrier_raised,
+	.raise_dtr_rts		= isicom_raise_dtr_rts,
 };
 
 static int __devinit reset_card(struct pci_dev *pdev,
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index eafbbcf355e..ff5ff618880 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -547,6 +547,17 @@ static int mxser_carrier_raised(struct tty_port *port)
 	return (inb(mp->ioaddr + UART_MSR) & UART_MSR_DCD)?1:0;
 }
 
+static void mxser_raise_dtr_rts(struct tty_port *port)
+{
+	struct mxser_port *mp = container_of(port, struct mxser_port, port);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mp->slock, flags);
+	outb(inb(mp->ioaddr + UART_MCR) |
+		UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+	spin_unlock_irqrestore(&mp->slock, flags);
+}
+
 static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
 		struct mxser_port *mp)
 {
@@ -586,10 +597,7 @@ static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
 	spin_unlock_irqrestore(&mp->slock, flags);
 	port->blocked_open++;
 	while (1) {
-		spin_lock_irqsave(&mp->slock, flags);
-		outb(inb(mp->ioaddr + UART_MCR) |
-			UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
-		spin_unlock_irqrestore(&mp->slock, flags);
+		tty_port_raise_dtr_rts(port);
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
 			if (port->flags & ASYNC_HUP_NOTIFY)
@@ -2458,6 +2466,7 @@ static const struct tty_operations mxser_ops = {
 
 struct tty_port_operations mxser_port_ops = {
 	.carrier_raised = mxser_carrier_raised,
+	.raise_dtr_rts = mxser_raise_dtr_rts,
 };
 
 /*
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 4a4110e703a..f4d9c399348 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -870,6 +870,13 @@ static int carrier_raised(struct tty_port *port)
 	return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0;
 }
 
+static void raise_dtr_rts(struct tty_port *port)
+{
+	struct r_port *info = container_of(port, struct r_port, port);
+	sSetDTR(&info->channel);
+	sSetRTS(&info->channel);
+}
+
 /*  info->port.count is considered critical, protected by spinlocks.  */
 static int block_til_ready(struct tty_struct *tty, struct file *filp,
 			   struct r_port *info)
@@ -928,10 +935,8 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	spin_unlock_irqrestore(&info->slock, flags);
 
 	while (1) {
-		if (tty->termios->c_cflag & CBAUD) {
-			sSetDTR(&info->channel);
-			sSetRTS(&info->channel);
-		}
+		if (tty->termios->c_cflag & CBAUD)
+			tty_port_raise_dtr_rts(port);
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (tty_hung_up_p(filp) || !(info->flags & ROCKET_INITIALIZED)) {
 			if (info->flags & ROCKET_HUP_NOTIFY)
@@ -2381,6 +2386,7 @@ static const struct tty_operations rocket_ops = {
 
 static const struct tty_port_operations rocket_port_ops = {
 	.carrier_raised = carrier_raised,
+	.raise_dtr_rts = raise_dtr_rts,
 };
 
 /*
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index fb2e6b5e0ef..ac9f21e18c3 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -3298,6 +3298,18 @@ static int carrier_raised(struct tty_port *port)
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
+static void raise_dtr_rts(struct tty_port *port)
+{
+	struct mgsl_struct *info = container_of(port, struct mgsl_struct, port);
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->irq_spinlock,flags);
+	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+ 	usc_set_serial_signals(info);
+	spin_unlock_irqrestore(&info->irq_spinlock,flags);
+}
+
+
 /* block_til_ready()
  * 
  * 	Block the current process until the specified port
@@ -3358,12 +3370,8 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 	port->blocked_open++;
 	
 	while (1) {
-		if (tty->termios->c_cflag & CBAUD) {
-			spin_lock_irqsave(&info->irq_spinlock,flags);
-			info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
-		 	usc_set_serial_signals(info);
-			spin_unlock_irqrestore(&info->irq_spinlock,flags);
-		}
+		if (tty->termios->c_cflag & CBAUD)
+			tty_port_raise_dtr_rts(port);
 		
 		set_current_state(TASK_INTERRUPTIBLE);
 		
@@ -4321,6 +4329,7 @@ static void mgsl_add_device( struct mgsl_struct *info )
 
 static const struct tty_port_operations mgsl_port_ops = {
 	.carrier_raised = carrier_raised,
+	.raise_dtr_rts = raise_dtr_rts,
 };
 
 
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 39ccaba8ca3..625c9bde3be 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -3143,6 +3143,18 @@ static int carrier_raised(struct tty_port *port)
 	return (info->signals & SerialSignal_DCD) ? 1 : 0;
 }
 
+static void raise_dtr_rts(struct tty_port *port)
+{
+	unsigned long flags;
+	struct slgt_info *info = container_of(port, struct slgt_info, port);
+
+	spin_lock_irqsave(&info->lock,flags);
+	info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+ 	set_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
+
 /*
  *  block current process until the device is ready to open
  */
@@ -3187,12 +3199,8 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	port->blocked_open++;
 
 	while (1) {
-		if ((tty->termios->c_cflag & CBAUD)) {
-			spin_lock_irqsave(&info->lock,flags);
-			info->signals |= SerialSignal_RTS + SerialSignal_DTR;
-		 	set_signals(info);
-			spin_unlock_irqrestore(&info->lock,flags);
-		}
+		if ((tty->termios->c_cflag & CBAUD))
+			tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -3455,6 +3463,7 @@ static void add_device(struct slgt_info *info)
 
 static const struct tty_port_operations slgt_port_ops = {
 	.carrier_raised = carrier_raised,
+	.raise_dtr_rts = raise_dtr_rts,
 };
 
 /*
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index f54e40cbf02..0557b638474 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -111,3 +111,19 @@ int tty_port_carrier_raised(struct tty_port *port)
 	return port->ops->carrier_raised(port);
 }
 EXPORT_SYMBOL(tty_port_carrier_raised);
+
+/**
+ *	tty_port_raise_dtr_rts	-	Riase DTR/RTS
+ *	@port: tty port
+ *
+ *	Wrapper for the DTR/RTS raise logic. For the moment this is used
+ *	to hide some internal details. This will eventually become entirely
+ *	internal to the tty port.
+ */
+
+void tty_port_raise_dtr_rts(struct tty_port *port)
+{
+	if (port->ops->raise_dtr_rts)
+		port->ops->raise_dtr_rts(port);
+}
+EXPORT_SYMBOL(tty_port_raise_dtr_rts);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index bc7bae78e22..5001bbcacff 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -185,6 +185,7 @@ struct tty_port;
 struct tty_port_operations {
 	/* Return 1 if the carrier is raised */
 	int (*carrier_raised)(struct tty_port *port);
+	void (*raise_dtr_rts)(struct tty_port *port);
 };
 	
 struct tty_port {
@@ -436,6 +437,7 @@ extern void tty_port_free_xmit_buf(struct tty_port *port);
 extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
+extern void tty_port_raise_dtr_rts(struct tty_port *port);
 
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
-- 
cgit v1.2.3-70-g09d2


From 3e61696bdc2103107674b06d0daf30b76193e922 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:45:26 +0000
Subject: isicom: redo locking to use tty port locks

This helps set the basis for moving block_til_ready into common code. We also
introduce a tty_port_hangup helper as this will also be generally needed.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/isicom.c     | 35 +++++++++++++++--------------------
 drivers/char/synclinkmp.c | 20 ++++++++++++++------
 drivers/char/tty_port.c   | 24 ++++++++++++++++++++++++
 include/linux/tty.h       |  1 +
 4 files changed, 54 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index a449449e301..db53db91ae4 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -841,7 +841,6 @@ static int isicom_carrier_raised(struct tty_port *port)
 static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	struct isi_port *ip)
 {
-	struct isi_board *card = ip->card;
 	struct tty_port *port = &ip->port;
 	int do_clocal = 0, retval;
 	unsigned long flags;
@@ -876,11 +875,11 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	retval = 0;
 	add_wait_queue(&port->open_wait, &wait);
 
-	spin_lock_irqsave(&card->card_lock, flags);
+	spin_lock_irqsave(&port->lock, flags);
 	if (!tty_hung_up_p(filp))
 		port->count--;
 	port->blocked_open++;
-	spin_unlock_irqrestore(&card->card_lock, flags);
+	spin_unlock_irqrestore(&port->lock, flags);
 
 	while (1) {
 		tty_port_raise_dtr_rts(port);
@@ -905,14 +904,13 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&port->open_wait, &wait);
-	spin_lock_irqsave(&card->card_lock, flags);
+	spin_lock_irqsave(&port->lock, flags);
 	if (!tty_hung_up_p(filp))
 		port->count++;
 	port->blocked_open--;
-	spin_unlock_irqrestore(&card->card_lock, flags);
-	if (retval)
-		return retval;
-	port->flags |= ASYNC_NORMAL_ACTIVE;
+	if (retval == 0)
+		port->flags |= ASYNC_NORMAL_ACTIVE;
+	spin_unlock_irqrestore(&port->lock, flags);
 	return 0;
 }
 
@@ -1034,9 +1032,9 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 
 	pr_dbg("Close start!!!.\n");
 
-	spin_lock_irqsave(&card->card_lock, flags);
+	spin_lock_irqsave(&port->port.lock, flags);
 	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&card->card_lock, flags);
+		spin_unlock_irqrestore(&port->port.lock, flags);
 		return;
 	}
 
@@ -1054,12 +1052,12 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 	}
 
 	if (port->port.count) {
-		spin_unlock_irqrestore(&card->card_lock, flags);
+		spin_unlock_irqrestore(&port->port.lock, flags);
 		return;
 	}
 	port->port.flags |= ASYNC_CLOSING;
 	tty->closing = 1;
-	spin_unlock_irqrestore(&card->card_lock, flags);
+	spin_unlock_irqrestore(&port->port.lock, flags);
 
 	if (port->port.closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, port->port.closing_wait);
@@ -1076,22 +1074,22 @@ static void isicom_close(struct tty_struct *tty, struct file *filp)
 	isicom_flush_buffer(tty);
 	tty_ldisc_flush(tty);
 
-	spin_lock_irqsave(&card->card_lock, flags);
+	spin_lock_irqsave(&port->port.lock, flags);
 	tty->closing = 0;
 
 	if (port->port.blocked_open) {
-		spin_unlock_irqrestore(&card->card_lock, flags);
+		spin_unlock_irqrestore(&port->port.lock, flags);
 		if (port->port.close_delay) {
 			pr_dbg("scheduling until time out.\n");
 			msleep_interruptible(
 				jiffies_to_msecs(port->port.close_delay));
 		}
-		spin_lock_irqsave(&card->card_lock, flags);
+		spin_lock_irqsave(&port->port.lock, flags);
 		wake_up_interruptible(&port->port.open_wait);
 	}
 	port->port.flags &= ~(ASYNC_NORMAL_ACTIVE | ASYNC_CLOSING);
 	wake_up_interruptible(&port->port.close_wait);
-	spin_unlock_irqrestore(&card->card_lock, flags);
+	spin_unlock_irqrestore(&port->port.lock, flags);
 }
 
 /* write et all */
@@ -1430,10 +1428,7 @@ static void isicom_hangup(struct tty_struct *tty)
 	isicom_shutdown_port(port);
 	spin_unlock_irqrestore(&port->card->card_lock, flags);
 
-	port->port.count = 0;
-	port->port.flags &= ~ASYNC_NORMAL_ACTIVE;
-	tty_port_tty_set(&port->port, NULL);
-	wake_up_interruptible(&port->port.open_wait);
+	tty_port_hangup(&port->port);
 }
 
 
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index fcf1ec77450..1f5c21ec4b1 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -3331,6 +3331,17 @@ static int carrier_raised(struct tty_port *port)
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
+static void raise_dtr_rts(struct tty_port *port)
+{
+	SLMP_INFO *info = container_of(port, SLMP_INFO, port);
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->lock,flags);
+	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+ 	set_signals(info);
+	spin_unlock_irqrestore(&info->lock,flags);
+}
+
 /* Block the current process until the specified port is ready to open.
  */
 static int block_til_ready(struct tty_struct *tty, struct file *filp,
@@ -3381,12 +3392,8 @@ static int block_til_ready(struct tty_struct *tty, struct file *filp,
 	port->blocked_open++;
 
 	while (1) {
-		if ((tty->termios->c_cflag & CBAUD)) {
-			spin_lock_irqsave(&info->lock,flags);
-			info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
-		 	set_signals(info);
-			spin_unlock_irqrestore(&info->lock,flags);
-		}
+		if (tty->termios->c_cflag & CBAUD)
+			tty_port_raise_dtr_rts(port);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -3793,6 +3800,7 @@ static void add_device(SLMP_INFO *info)
 
 static const struct tty_port_operations port_ops = {
 	.carrier_raised = carrier_raised,
+	.raise_dtr_rts = raise_dtr_rts,
 };
 
 /* Allocate and initialize a device instance structure
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 0557b638474..9f418bca4a2 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -7,6 +7,7 @@
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
 #include <linux/tty_flip.h>
+#include <linux/serial.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/slab.h>
@@ -95,6 +96,29 @@ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty)
 }
 EXPORT_SYMBOL(tty_port_tty_set);
 
+/**
+ *	tty_port_hangup		-	hangup helper
+ *	@port: tty port
+ *
+ *	Perform port level tty hangup flag and count changes. Drop the tty
+ *	reference.
+ */
+
+void tty_port_hangup(struct tty_port *port)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	port->count = 0;
+	port->flags &= ~ASYNC_NORMAL_ACTIVE;
+	if (port->tty)
+		tty_kref_put(port->tty);
+	port->tty = NULL;
+	spin_unlock_irqrestore(&port->lock, flags);
+	wake_up_interruptible(&port->open_wait);
+}
+EXPORT_SYMBOL(tty_port_hangup);
+
 /**
  *	tty_port_carrier_raised	-	carrier raised check
  *	@port: tty port
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 5001bbcacff..a1a93140e6e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -438,6 +438,7 @@ extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
 extern void tty_port_raise_dtr_rts(struct tty_port *port);
+extern void tty_port_hangup(struct tty_port *port);
 
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
-- 
cgit v1.2.3-70-g09d2


From 36c621d82b956ff6ff72273f848af53e6c581aba Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:46:10 +0000
Subject: tty: Introduce a tty_port generic block_til_ready

Start sucking more commonality out of the drivers into a single piece of
core code.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/isicom.c   |  79 +-----------------------------------
 drivers/char/mxser.c    |  71 +-------------------------------
 drivers/char/riscom8.c  |  86 +--------------------------------------
 drivers/char/synclink.c |   1 +
 drivers/char/tty_port.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/char/vme_scc.c  |   6 +--
 include/linux/tty.h     |   2 +
 7 files changed, 115 insertions(+), 235 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index db53db91ae4..bac55cf4424 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -838,82 +838,6 @@ static int isicom_carrier_raised(struct tty_port *port)
 	return (ip->status & ISI_DCD)?1 : 0;
 }
 
-static int block_til_ready(struct tty_struct *tty, struct file *filp,
-	struct isi_port *ip)
-{
-	struct tty_port *port = &ip->port;
-	int do_clocal = 0, retval;
-	unsigned long flags;
-	DECLARE_WAITQUEUE(wait, current);
-	int cd;
-
-	/* block if port is in the process of being closed */
-
-	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-		pr_dbg("block_til_ready: close in progress.\n");
-		interruptible_sleep_on(&port->close_wait);
-		if (port->flags & ASYNC_HUP_NOTIFY)
-			return -EAGAIN;
-		else
-			return -ERESTARTSYS;
-	}
-
-	/* if non-blocking mode is set ... */
-
-	if ((filp->f_flags & O_NONBLOCK) ||
-			(tty->flags & (1 << TTY_IO_ERROR))) {
-		pr_dbg("block_til_ready: non-block mode.\n");
-		port->flags |= ASYNC_NORMAL_ACTIVE;
-		return 0;
-	}
-
-	if (C_CLOCAL(tty))
-		do_clocal = 1;
-
-	/* block waiting for DCD to be asserted, and while
-						callout dev is busy */
-	retval = 0;
-	add_wait_queue(&port->open_wait, &wait);
-
-	spin_lock_irqsave(&port->lock, flags);
-	if (!tty_hung_up_p(filp))
-		port->count--;
-	port->blocked_open++;
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	while (1) {
-		tty_port_raise_dtr_rts(port);
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
-			if (port->flags & ASYNC_HUP_NOTIFY)
-				retval = -EAGAIN;
-			else
-				retval = -ERESTARTSYS;
-			break;
-		}
-		cd = tty_port_carrier_raised(port);
-		if (!(port->flags & ASYNC_CLOSING) &&
-				(do_clocal || cd))
-			break;
-		if (signal_pending(current)) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		schedule();
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->open_wait, &wait);
-	spin_lock_irqsave(&port->lock, flags);
-	if (!tty_hung_up_p(filp))
-		port->count++;
-	port->blocked_open--;
-	if (retval == 0)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
-	spin_unlock_irqrestore(&port->lock, flags);
-	return 0;
-}
-
 static int isicom_open(struct tty_struct *tty, struct file *filp)
 {
 	struct isi_port *port;
@@ -940,12 +864,13 @@ static int isicom_open(struct tty_struct *tty, struct file *filp)
 
 	isicom_setup_board(card);
 
+	/* FIXME: locking on port.count etc */
 	port->port.count++;
 	tty->driver_data = port;
 	tty_port_tty_set(&port->port, tty);
 	error = isicom_setup_port(tty);
 	if (error == 0)
-		error = block_til_ready(tty, filp, port);
+		error = tty_port_block_til_ready(&port->port, tty, filp);
 	return error;
 }
 
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index e2471cf1ee9..08ba6eb1a38 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -558,75 +558,6 @@ static void mxser_raise_dtr_rts(struct tty_port *port)
 	spin_unlock_irqrestore(&mp->slock, flags);
 }
 
-static int mxser_block_til_ready(struct tty_struct *tty, struct file *filp,
-		struct mxser_port *mp)
-{
-	DECLARE_WAITQUEUE(wait, current);
-	int retval;
-	int do_clocal = 0;
-	unsigned long flags;
-	int cd;
-	struct tty_port *port = &mp->port;
-
-	/*
-	 * If non-blocking mode is set, or the port is not enabled,
-	 * then make the check up front and then exit.
-	 */
-	if ((filp->f_flags & O_NONBLOCK) ||
-			test_bit(TTY_IO_ERROR, &tty->flags)) {
-		port->flags |= ASYNC_NORMAL_ACTIVE;
-		return 0;
-	}
-
-	if (tty->termios->c_cflag & CLOCAL)
-		do_clocal = 1;
-
-	/*
-	 * Block waiting for the carrier detect and the line to become
-	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, port->count is dropped by one, so that
-	 * mxser_close() knows when to free things.  We restore it upon
-	 * exit, either normal or abnormal.
-	 */
-	retval = 0;
-	add_wait_queue(&port->open_wait, &wait);
-
-	spin_lock_irqsave(&port->lock, flags);
-	if (!tty_hung_up_p(filp))
-		port->count--;
-	port->blocked_open++;
-	spin_unlock_irqrestore(&port->lock, flags);
-	while (1) {
-		tty_port_raise_dtr_rts(port);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
-			if (port->flags & ASYNC_HUP_NOTIFY)
-				retval = -EAGAIN;
-			else
-				retval = -ERESTARTSYS;
-			break;
-		}
-		cd = tty_port_carrier_raised(port);
-		if (!(port->flags & ASYNC_CLOSING) && (do_clocal || cd))
-			break;
-		if (signal_pending(current)) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		schedule();
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->open_wait, &wait);
-	spin_lock_irqsave(&port->lock, flags);
-	if (!tty_hung_up_p(filp))
-		port->count++;
-	port->blocked_open--;
-	if (retval == 0)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
-	spin_unlock_irqrestore(&port->lock, flags);
-	return 0;
-}
-
 static int mxser_set_baud(struct tty_struct *tty, long newspd)
 {
 	struct mxser_port *info = tty->driver_data;
@@ -1110,7 +1041,7 @@ static int mxser_open(struct tty_struct *tty, struct file *filp)
 	if (retval)
 		return retval;
 
-	retval = mxser_block_til_ready(tty, filp, info);
+	retval = tty_port_block_til_ready(&info->port, tty, filp);
 	if (retval)
 		return retval;
 
diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index 14662d7aa62..af34c2054a0 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -874,90 +874,6 @@ static int carrier_raised(struct tty_port *port)
 	return CD;
 }
 
-static int block_til_ready(struct tty_struct *tty, struct file *filp,
-			   struct riscom_port *rp)
-{
-	DECLARE_WAITQUEUE(wait, current);
-	int    retval;
-	int    do_clocal = 0;
-	int    CD;
-	unsigned long flags;
-	struct tty_port *port = &rp->port;
-
-	/*
-	 * If the device is in the middle of being closed, then block
-	 * until it's done, and then try again.
-	 */
-	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&port->close_wait);
-		if (port->flags & ASYNC_HUP_NOTIFY)
-			return -EAGAIN;
-		else
-			return -ERESTARTSYS;
-	}
-
-	/*
-	 * If non-blocking mode is set, or the port is not enabled,
-	 * then make the check up front and then exit.
-	 */
-	if ((filp->f_flags & O_NONBLOCK) ||
-	    (tty->flags & (1 << TTY_IO_ERROR))) {
-		port->flags |= ASYNC_NORMAL_ACTIVE;
-		return 0;
-	}
-
-	if (C_CLOCAL(tty))
-		do_clocal = 1;
-
-	/*
-	 * Block waiting for the carrier detect and the line to become
-	 * free (i.e., not in use by the callout).  While we are in
-	 * this loop, info->count is dropped by one, so that
-	 * rs_close() knows when to free things.  We restore it upon
-	 * exit, either normal or abnormal.
-	 */
-	retval = 0;
-	add_wait_queue(&port->open_wait, &wait);
-
-	spin_lock_irqsave(&port->lock, flags);
-	if (!tty_hung_up_p(filp))
-		port->count--;
-	port->blocked_open++;
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	while (1) {
-
-		CD = tty_port_carrier_raised(port);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (tty_hung_up_p(filp) ||
-		    !(port->flags & ASYNC_INITIALIZED)) {
-			if (port->flags & ASYNC_HUP_NOTIFY)
-				retval = -EAGAIN;
-			else
-				retval = -ERESTARTSYS;
-			break;
-		}
-		if (!(port->flags & ASYNC_CLOSING) &&
-		    (do_clocal || CD))
-			break;
-		if (signal_pending(current)) {
-			retval = -ERESTARTSYS;
-			break;
-		}
-		schedule();
-	}
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->open_wait, &wait);
-	spin_lock_irqsave(&port->lock, flags);
-	if (!tty_hung_up_p(filp))
-		port->count++;
-	port->blocked_open--;
-	if (retval == 0)
-		port->flags |= ASYNC_NORMAL_ACTIVE;
-	spin_unlock_irqrestore(&port->lock, flags);
-	return 0;
-}
-
 static int rc_open(struct tty_struct *tty, struct file *filp)
 {
 	int board;
@@ -984,7 +900,7 @@ static int rc_open(struct tty_struct *tty, struct file *filp)
 
 	error = rc_setup_port(bp, port);
 	if (error == 0)
-		error = block_til_ready(tty, filp, port);
+		error = tty_port_block_til_ready(&port->port, tty, filp);
 	return error;
 }
 
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index ac9f21e18c3..0ded4ed3da3 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -3401,6 +3401,7 @@ static int block_til_ready(struct tty_struct *tty, struct file * filp,
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&port->open_wait, &wait);
 	
+	/* FIXME: Racy on hangup during close wait */
 	if (extra_count)
 		port->count++;
 	port->blocked_open--;
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 9f418bca4a2..ff94182b381 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -151,3 +151,108 @@ void tty_port_raise_dtr_rts(struct tty_port *port)
 		port->ops->raise_dtr_rts(port);
 }
 EXPORT_SYMBOL(tty_port_raise_dtr_rts);
+
+/**
+ *	tty_port_block_til_ready	-	Waiting logic for tty open
+ *	@port: the tty port being opened
+ *	@tty: the tty device being bound
+ *	@filp: the file pointer of the opener
+ *
+ *	Implement the core POSIX/SuS tty behaviour when opening a tty device.
+ *	Handles:
+ *		- hangup (both before and during)
+ *		- non blocking open
+ *		- rts/dtr/dcd
+ *		- signals
+ *		- port flags and counts
+ *
+ *	The passed tty_port must implement the carrier_raised method if it can
+ *	do carrier detect and the raise_dtr_rts method if it supports software
+ *	management of these lines. Note that the dtr/rts raise is done each
+ *	iteration as a hangup may have previously dropped them while we wait.
+ */
+ 
+int tty_port_block_til_ready(struct tty_port *port,
+				struct tty_struct *tty, struct file *filp)
+{
+	int do_clocal = 0, retval;
+	unsigned long flags;
+	DECLARE_WAITQUEUE(wait, current);
+	int cd;
+
+	/* block if port is in the process of being closed */
+	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
+		interruptible_sleep_on(&port->close_wait);
+		if (port->flags & ASYNC_HUP_NOTIFY)
+			return -EAGAIN;
+		else
+			return -ERESTARTSYS;
+	}
+
+	/* if non-blocking mode is set we can pass directly to open unless
+	   the port has just hung up or is in another error state */
+	if ((filp->f_flags & O_NONBLOCK) ||
+			(tty->flags & (1 << TTY_IO_ERROR))) {
+		port->flags |= ASYNC_NORMAL_ACTIVE;
+		return 0;
+	}
+
+	if (C_CLOCAL(tty))
+		do_clocal = 1;
+
+	/* Block waiting until we can proceed. We may need to wait for the
+	   carrier, but we must also wait for any close that is in progress
+	   before the next open may complete */
+
+	retval = 0;
+	add_wait_queue(&port->open_wait, &wait);
+
+	/* The port lock protects the port counts */
+	spin_lock_irqsave(&port->lock, flags);
+	if (!tty_hung_up_p(filp))
+		port->count--;
+	port->blocked_open++;
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	while (1) {
+		/* Indicate we are open */
+		tty_port_raise_dtr_rts(port);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		/* Check for a hangup or uninitialised port. Return accordingly */
+		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
+			if (port->flags & ASYNC_HUP_NOTIFY)
+				retval = -EAGAIN;
+			else
+				retval = -ERESTARTSYS;
+			break;
+		}
+		/* Probe the carrier. For devices with no carrier detect this
+		   will always return true */
+		cd = tty_port_carrier_raised(port);
+		if (!(port->flags & ASYNC_CLOSING) &&
+				(do_clocal || cd))
+			break;
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&port->open_wait, &wait);
+
+	/* Update counts. A parallel hangup will have set count to zero and
+	   we must not mess that up further */
+	spin_lock_irqsave(&port->lock, flags);
+	if (!tty_hung_up_p(filp))
+		port->count++;
+	port->blocked_open--;
+	if (retval == 0)
+		port->flags |= ASYNC_NORMAL_ACTIVE;
+	spin_unlock_irqrestore(&port->lock, flags);
+	return 0;
+	
+}
+EXPORT_SYMBOL(tty_port_block_til_ready);
+
diff --git a/drivers/char/vme_scc.c b/drivers/char/vme_scc.c
index d4e1534c0e0..2d9242a45a0 100644
--- a/drivers/char/vme_scc.c
+++ b/drivers/char/vme_scc.c
@@ -631,8 +631,8 @@ static void scc_enable_rx_interrupts(void *ptr)
 
 static int scc_carrier_raised(struct tty_port *port)
 {
-	struct scc_port *scc = container_of(port, struct scc_port, gs.port);
-	unsigned channel = port->channel;
+	struct scc_port *sc = container_of(port, struct scc_port, gs.port);
+	unsigned channel = sc->channel;
 
 	return !!(scc_last_status_reg[channel] & SR_DCD);
 }
@@ -643,7 +643,7 @@ static void scc_shutdown_port(void *ptr)
 	struct scc_port *port = ptr;
 
 	port->gs.port.flags &= ~ GS_ACTIVE;
-	if (port->gs.port.tty && port->gs.port.tty->termios->c_cflag & HUPCL) {
+	if (port->gs.port.tty && (port->gs.port.tty->termios->c_cflag & HUPCL)) {
 		scc_setsignals (port, 0, 0);
 	}
 }
diff --git a/include/linux/tty.h b/include/linux/tty.h
index a1a93140e6e..61a0ab32cf1 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -439,6 +439,8 @@ extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
 extern void tty_port_raise_dtr_rts(struct tty_port *port);
 extern void tty_port_hangup(struct tty_port *port);
+extern int tty_port_block_til_ready(struct tty_port *port,
+				struct tty_struct *tty, struct file *filp);
 
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
-- 
cgit v1.2.3-70-g09d2


From 2a6eadbd5a2ae8f458e421f3614f1ad13c0f9a1c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:46:18 +0000
Subject: tty: Rework istallion to use the tty port changes

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/istallion.c  | 159 +++++++++++++---------------------------------
 include/linux/istallion.h |   1 -
 2 files changed, 43 insertions(+), 117 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index c4682f9e34b..4c69ab97339 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -626,8 +626,6 @@ static int	stli_hostcmd(struct stlibrd *brdp, struct stliport *portp);
 static int	stli_initopen(struct tty_struct *tty, struct stlibrd *brdp, struct stliport *portp);
 static int	stli_rawopen(struct stlibrd *brdp, struct stliport *portp, unsigned long arg, int wait);
 static int	stli_rawclose(struct stlibrd *brdp, struct stliport *portp, unsigned long arg, int wait);
-static int	stli_waitcarrier(struct tty_struct *tty, struct stlibrd *brdp,
-				struct stliport *portp, struct file *filp);
 static int	stli_setport(struct tty_struct *tty);
 static int	stli_cmdwait(struct stlibrd *brdp, struct stliport *portp, unsigned long cmd, void *arg, int size, int copyback);
 static void	stli_sendcmd(struct stlibrd *brdp, struct stliport *portp, unsigned long cmd, void *arg, int size, int copyback);
@@ -787,6 +785,7 @@ static int stli_open(struct tty_struct *tty, struct file *filp)
 {
 	struct stlibrd *brdp;
 	struct stliport *portp;
+	struct tty_port *port;
 	unsigned int minordev, brdnr, portnr;
 	int rc;
 
@@ -808,30 +807,19 @@ static int stli_open(struct tty_struct *tty, struct file *filp)
 		return -ENODEV;
 	if (portp->devnr < 1)
 		return -ENODEV;
-
-
-/*
- *	Check if this port is in the middle of closing. If so then wait
- *	until it is closed then return error status based on flag settings.
- *	The sleep here does not need interrupt protection since the wakeup
- *	for it is done with the same context.
- */
-	if (portp->port.flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&portp->port.close_wait);
-		if (portp->port.flags & ASYNC_HUP_NOTIFY)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
+	port = &portp->port;
 
 /*
  *	On the first open of the device setup the port hardware, and
  *	initialize the per port data structure. Since initializing the port
  *	requires several commands to the board we will need to wait for any
  *	other open that is already initializing the port.
+ *
+ *	Review - locking
  */
-	tty_port_tty_set(&portp->port, tty);
+	tty_port_tty_set(port, tty);
 	tty->driver_data = portp;
-	portp->port.count++;
+	port->count++;
 
 	wait_event_interruptible(portp->raw_wait,
 			!test_bit(ST_INITIALIZING, &portp->state));
@@ -841,7 +829,8 @@ static int stli_open(struct tty_struct *tty, struct file *filp)
 	if ((portp->port.flags & ASYNC_INITIALIZED) == 0) {
 		set_bit(ST_INITIALIZING, &portp->state);
 		if ((rc = stli_initopen(tty, brdp, portp)) >= 0) {
-			portp->port.flags |= ASYNC_INITIALIZED;
+			/* Locking */
+			port->flags |= ASYNC_INITIALIZED;
 			clear_bit(TTY_IO_ERROR, &tty->flags);
 		}
 		clear_bit(ST_INITIALIZING, &portp->state);
@@ -849,31 +838,7 @@ static int stli_open(struct tty_struct *tty, struct file *filp)
 		if (rc < 0)
 			return rc;
 	}
-
-/*
- *	Check if this port is in the middle of closing. If so then wait
- *	until it is closed then return error status, based on flag settings.
- *	The sleep here does not need interrupt protection since the wakeup
- *	for it is done with the same context.
- */
-	if (portp->port.flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&portp->port.close_wait);
-		if (portp->port.flags & ASYNC_HUP_NOTIFY)
-			return -EAGAIN;
-		return -ERESTARTSYS;
-	}
-
-/*
- *	Based on type of open being done check if it can overlap with any
- *	previous opens still in effect. If we are a normal serial device
- *	then also we might have to wait for carrier.
- */
-	if (!(filp->f_flags & O_NONBLOCK)) {
-		if ((rc = stli_waitcarrier(tty, brdp, portp, filp)) != 0)
-			return rc;
-	}
-	portp->port.flags |= ASYNC_NORMAL_ACTIVE;
-	return 0;
+	return tty_port_block_til_ready(&portp->port, tty, filp);
 }
 
 /*****************************************************************************/
@@ -882,25 +847,29 @@ static void stli_close(struct tty_struct *tty, struct file *filp)
 {
 	struct stlibrd *brdp;
 	struct stliport *portp;
+	struct tty_port *port;
 	unsigned long flags;
 
 	portp = tty->driver_data;
 	if (portp == NULL)
 		return;
+	port = &portp->port;
 
-	spin_lock_irqsave(&stli_lock, flags);
+	spin_lock_irqsave(&port->lock, flags);
 	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&stli_lock, flags);
+		spin_unlock_irqrestore(&port->lock, flags);
 		return;
 	}
-	if ((tty->count == 1) && (portp->port.count != 1))
-		portp->port.count = 1;
-	if (portp->port.count-- > 1) {
-		spin_unlock_irqrestore(&stli_lock, flags);
+	if (tty->count == 1 && port->count != 1)
+		port->count = 1;
+	if (port->count-- > 1) {
+		spin_unlock_irqrestore(&port->lock, flags);
 		return;
 	}
 
-	portp->port.flags |= ASYNC_CLOSING;
+	port->flags |= ASYNC_CLOSING;
+	tty->closing = 1;
+	spin_unlock_irqrestore(&port->lock, flags);
 
 /*
  *	May want to wait for data to drain before closing. The BUSY flag
@@ -908,15 +877,17 @@ static void stli_close(struct tty_struct *tty, struct file *filp)
  *	updated by messages from the slave - indicating when all chars
  *	really have drained.
  */
+ 	spin_lock_irqsave(&stli_lock, flags);
 	if (tty == stli_txcooktty)
 		stli_flushchars(tty);
-	tty->closing = 1;
 	spin_unlock_irqrestore(&stli_lock, flags);
 
 	if (portp->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, portp->closing_wait);
 
-	portp->port.flags &= ~ASYNC_INITIALIZED;
+	/* FIXME: port locking here needs attending to */
+	port->flags &= ~ASYNC_INITIALIZED;
+
 	brdp = stli_brds[portp->brdnr];
 	stli_rawclose(brdp, portp, 0, 0);
 	if (tty->termios->c_cflag & HUPCL) {
@@ -937,14 +908,14 @@ static void stli_close(struct tty_struct *tty, struct file *filp)
 	tty->closing = 0;
 	tty_port_tty_set(&portp->port, NULL);
 
-	if (portp->openwaitcnt) {
+	if (port->blocked_open) {
 		if (portp->close_delay)
 			msleep_interruptible(jiffies_to_msecs(portp->close_delay));
-		wake_up_interruptible(&portp->port.open_wait);
+		wake_up_interruptible(&port->open_wait);
 	}
 
-	portp->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-	wake_up_interruptible(&portp->port.close_wait);
+	port->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
+	wake_up_interruptible(&port->close_wait);
 }
 
 /*****************************************************************************/
@@ -1189,63 +1160,17 @@ static int stli_carrier_raised(struct tty_port *port)
 	return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-/*
- *	Possibly need to wait for carrier (DCD signal) to come high. Say
- *	maybe because if we are clocal then we don't need to wait...
- */
-
-static int stli_waitcarrier(struct tty_struct *tty, struct stlibrd *brdp,
-				struct stliport *portp, struct file *filp)
+static void stli_raise_dtr_rts(struct tty_port *port)
 {
-	unsigned long flags;
-	int rc, doclocal;
-	struct tty_port *port = &portp->port;
-
-	rc = 0;
-	doclocal = 0;
-
-	if (tty->termios->c_cflag & CLOCAL)
-		doclocal++;
-
-	spin_lock_irqsave(&stli_lock, flags);
-	portp->openwaitcnt++;
-	if (! tty_hung_up_p(filp))
-		port->count--;
-	spin_unlock_irqrestore(&stli_lock, flags);
-
-	for (;;) {
-		stli_mkasysigs(&portp->asig, 1, 1);
-		if ((rc = stli_cmdwait(brdp, portp, A_SETSIGNALS,
-		    &portp->asig, sizeof(asysigs_t), 0)) < 0)
-			break;
-		if (tty_hung_up_p(filp) ||
-		    ((port->flags & ASYNC_INITIALIZED) == 0)) {
-			if (port->flags & ASYNC_HUP_NOTIFY)
-				rc = -EBUSY;
-			else
-				rc = -ERESTARTSYS;
-			break;
-		}
-		if (((port->flags & ASYNC_CLOSING) == 0) &&
-		    (doclocal || tty_port_carrier_raised(port))) {
-			break;
-		}
-		if (signal_pending(current)) {
-			rc = -ERESTARTSYS;
-			break;
-		}
-		interruptible_sleep_on(&port->open_wait);
-	}
-
-	spin_lock_irqsave(&stli_lock, flags);
-	if (! tty_hung_up_p(filp))
-		port->count++;
-	portp->openwaitcnt--;
-	spin_unlock_irqrestore(&stli_lock, flags);
-
-	return rc;
+	struct stliport *portp = container_of(port, struct stliport, port);
+	struct stlibrd *brdp = stli_brds[portp->brdnr];
+	stli_mkasysigs(&portp->asig, 1, 1);
+	if (stli_cmdwait(brdp, portp, A_SETSIGNALS, &portp->asig,
+		sizeof(asysigs_t), 0) < 0)
+			printk(KERN_WARNING "istallion: dtr raise failed.\n");
 }
 
+
 /*****************************************************************************/
 
 /*
@@ -1828,6 +1753,7 @@ static void stli_hangup(struct tty_struct *tty)
 {
 	struct stliport *portp;
 	struct stlibrd *brdp;
+	struct tty_port *port;
 	unsigned long flags;
 
 	portp = tty->driver_data;
@@ -1838,8 +1764,11 @@ static void stli_hangup(struct tty_struct *tty)
 	brdp = stli_brds[portp->brdnr];
 	if (brdp == NULL)
 		return;
+	port = &portp->port;
 
-	portp->port.flags &= ~ASYNC_INITIALIZED;
+	spin_lock_irqsave(&port->lock, flags);
+	port->flags &= ~ASYNC_INITIALIZED;
+	spin_unlock_irqrestore(&port->lock, flags);
 
 	if (!test_bit(ST_CLOSING, &portp->state))
 		stli_rawclose(brdp, portp, 0, 0);
@@ -1860,12 +1789,9 @@ static void stli_hangup(struct tty_struct *tty)
 	clear_bit(ST_TXBUSY, &portp->state);
 	clear_bit(ST_RXSTOP, &portp->state);
 	set_bit(TTY_IO_ERROR, &tty->flags);
-	tty_port_tty_set(&portp->port, NULL);
-	portp->port.flags &= ~ASYNC_NORMAL_ACTIVE;
-	portp->port.count = 0;
 	spin_unlock_irqrestore(&stli_lock, flags);
 
-	wake_up_interruptible(&portp->port.open_wait);
+	tty_port_hangup(port);
 }
 
 /*****************************************************************************/
@@ -4528,6 +4454,7 @@ static const struct tty_operations stli_ops = {
 
 static const struct tty_port_operations stli_port_ops = {
 	.carrier_raised = stli_carrier_raised,
+	.raise_dtr_rts = stli_raise_dtr_rts,
 };
 
 /*****************************************************************************/
diff --git a/include/linux/istallion.h b/include/linux/istallion.h
index 0d184072324..053d5aea925 100644
--- a/include/linux/istallion.h
+++ b/include/linux/istallion.h
@@ -61,7 +61,6 @@ struct stliport {
 	int			custom_divisor;
 	int			close_delay;
 	int			closing_wait;
-	int			openwaitcnt;
 	int			rc;
 	int			argsize;
 	void			*argp;
-- 
cgit v1.2.3-70-g09d2


From a6614999e800cf3a134ce93ea46ef837e3c0e76e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:46:50 +0000
Subject: tty: Introduce some close helpers for ports

Again this is a lot of common code we can unify

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/isicom.c      | 66 ++++++---------------------------------
 drivers/char/istallion.c   | 78 +++++++++++++++++-----------------------------
 drivers/char/mxser.c       | 56 ++++++---------------------------
 drivers/char/riscom8.c     | 49 +++--------------------------
 drivers/char/stallion.c    | 39 +++--------------------
 drivers/char/synclink.c    | 58 ++--------------------------------
 drivers/char/synclink_gt.c | 51 ++----------------------------
 drivers/char/synclinkmp.c  | 58 ++--------------------------------
 drivers/char/tty_port.c    | 58 ++++++++++++++++++++++++++++++++++
 include/linux/istallion.h  |  1 -
 include/linux/tty.h        |  3 ++
 11 files changed, 126 insertions(+), 391 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index bac55cf4424..24aa6e88e22 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -945,76 +945,30 @@ static void isicom_flush_buffer(struct tty_struct *tty)
 
 static void isicom_close(struct tty_struct *tty, struct file *filp)
 {
-	struct isi_port *port = tty->driver_data;
+	struct isi_port *ip = tty->driver_data;
+	struct tty_port *port = &ip->port;
 	struct isi_board *card;
 	unsigned long flags;
 
-	if (!port)
-		return;
-	card = port->card;
-	if (isicom_paranoia_check(port, tty->name, "isicom_close"))
-		return;
-
-	pr_dbg("Close start!!!.\n");
-
-	spin_lock_irqsave(&port->port.lock, flags);
-	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&port->port.lock, flags);
-		return;
-	}
-
-	if (tty->count == 1 && port->port.count != 1) {
-		printk(KERN_WARNING "ISICOM:(0x%lx) isicom_close: bad port "
-			"count tty->count = 1 port count = %d.\n",
-			card->base, port->port.count);
-		port->port.count = 1;
-	}
-	if (--port->port.count < 0) {
-		printk(KERN_WARNING "ISICOM:(0x%lx) isicom_close: bad port "
-			"count for channel%d = %d", card->base, port->channel,
-			port->port.count);
-		port->port.count = 0;
-	}
+	BUG_ON(!ip);
 
-	if (port->port.count) {
-		spin_unlock_irqrestore(&port->port.lock, flags);
+	card = ip->card;
+	if (isicom_paranoia_check(ip, tty->name, "isicom_close"))
 		return;
-	}
-	port->port.flags |= ASYNC_CLOSING;
-	tty->closing = 1;
-	spin_unlock_irqrestore(&port->port.lock, flags);
 
-	if (port->port.closing_wait != ASYNC_CLOSING_WAIT_NONE)
-		tty_wait_until_sent(tty, port->port.closing_wait);
 	/* indicate to the card that no more data can be received
 	   on this port */
 	spin_lock_irqsave(&card->card_lock, flags);
-	if (port->port.flags & ASYNC_INITIALIZED) {
-		card->port_status &= ~(1 << port->channel);
+	if (port->flags & ASYNC_INITIALIZED) {
+		card->port_status &= ~(1 << ip->channel);
 		outw(card->port_status, card->base + 0x02);
 	}
-	isicom_shutdown_port(port);
+	isicom_shutdown_port(ip);
 	spin_unlock_irqrestore(&card->card_lock, flags);
 
 	isicom_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-
-	spin_lock_irqsave(&port->port.lock, flags);
-	tty->closing = 0;
-
-	if (port->port.blocked_open) {
-		spin_unlock_irqrestore(&port->port.lock, flags);
-		if (port->port.close_delay) {
-			pr_dbg("scheduling until time out.\n");
-			msleep_interruptible(
-				jiffies_to_msecs(port->port.close_delay));
-		}
-		spin_lock_irqsave(&port->port.lock, flags);
-		wake_up_interruptible(&port->port.open_wait);
-	}
-	port->port.flags &= ~(ASYNC_NORMAL_ACTIVE | ASYNC_CLOSING);
-	wake_up_interruptible(&port->port.close_wait);
-	spin_unlock_irqrestore(&port->port.lock, flags);
+	
+	tty_port_close_end(port, tty);
 }
 
 /* write et all */
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index 4c69ab97339..5c3dc6b8411 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -767,7 +767,7 @@ static int stli_parsebrd(struct stlconf *confp, char **argp)
 			break;
 	}
 	if (i == ARRAY_SIZE(stli_brdstr)) {
-		printk("STALLION: unknown board name, %s?\n", argp[0]);
+		printk(KERN_WARNING "istallion: unknown board name, %s?\n", argp[0]);
 		return 0;
 	}
 
@@ -855,21 +855,8 @@ static void stli_close(struct tty_struct *tty, struct file *filp)
 		return;
 	port = &portp->port;
 
-	spin_lock_irqsave(&port->lock, flags);
-	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&port->lock, flags);
-		return;
-	}
-	if (tty->count == 1 && port->count != 1)
-		port->count = 1;
-	if (port->count-- > 1) {
-		spin_unlock_irqrestore(&port->lock, flags);
+	if (tty_port_close_start(port, tty, filp) == 0)
 		return;
-	}
-
-	port->flags |= ASYNC_CLOSING;
-	tty->closing = 1;
-	spin_unlock_irqrestore(&port->lock, flags);
 
 /*
  *	May want to wait for data to drain before closing. The BUSY flag
@@ -882,6 +869,8 @@ static void stli_close(struct tty_struct *tty, struct file *filp)
 		stli_flushchars(tty);
 	spin_unlock_irqrestore(&stli_lock, flags);
 
+	/* We end up doing this twice for the moment. This needs looking at
+	   eventually. Note we still use portp->closing_wait as a result */
 	if (portp->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, portp->closing_wait);
 
@@ -905,17 +894,8 @@ static void stli_close(struct tty_struct *tty, struct file *filp)
 	set_bit(ST_DOFLUSHRX, &portp->state);
 	stli_flushbuffer(tty);
 
-	tty->closing = 0;
-	tty_port_tty_set(&portp->port, NULL);
-
-	if (port->blocked_open) {
-		if (portp->close_delay)
-			msleep_interruptible(jiffies_to_msecs(portp->close_delay));
-		wake_up_interruptible(&port->open_wait);
-	}
-
-	port->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-	wake_up_interruptible(&port->close_wait);
+	tty_port_close_end(port, tty);
+	tty_port_tty_set(port, NULL);
 }
 
 /*****************************************************************************/
@@ -1482,7 +1462,7 @@ static int stli_getserial(struct stliport *portp, struct serial_struct __user *s
 	sio.irq = 0;
 	sio.flags = portp->port.flags;
 	sio.baud_base = portp->baud_base;
-	sio.close_delay = portp->close_delay;
+	sio.close_delay = portp->port.close_delay;
 	sio.closing_wait = portp->closing_wait;
 	sio.custom_divisor = portp->custom_divisor;
 	sio.xmit_fifo_size = 0;
@@ -1514,7 +1494,7 @@ static int stli_setserial(struct tty_struct *tty, struct serial_struct __user *s
 		return -EFAULT;
 	if (!capable(CAP_SYS_ADMIN)) {
 		if ((sio.baud_base != portp->baud_base) ||
-		    (sio.close_delay != portp->close_delay) ||
+		    (sio.close_delay != portp->port.close_delay) ||
 		    ((sio.flags & ~ASYNC_USR_MASK) !=
 		    (portp->port.flags & ~ASYNC_USR_MASK)))
 			return -EPERM;
@@ -1523,7 +1503,7 @@ static int stli_setserial(struct tty_struct *tty, struct serial_struct __user *s
 	portp->port.flags = (portp->port.flags & ~ASYNC_USR_MASK) |
 		(sio.flags & ASYNC_USR_MASK);
 	portp->baud_base = sio.baud_base;
-	portp->close_delay = sio.close_delay;
+	portp->port.close_delay = sio.close_delay;
 	portp->closing_wait = sio.closing_wait;
 	portp->custom_divisor = sio.custom_divisor;
 
@@ -2065,7 +2045,7 @@ static void __stli_sendcmd(struct stlibrd *brdp, struct stliport *portp, unsigne
 	unsigned char __iomem *bits;
 
 	if (test_bit(ST_CMDING, &portp->state)) {
-		printk(KERN_ERR "STALLION: command already busy, cmd=%x!\n",
+		printk(KERN_ERR "istallion: command already busy, cmd=%x!\n",
 				(int) cmd);
 		return;
 	}
@@ -2625,7 +2605,7 @@ static int stli_initports(struct stlibrd *brdp)
 	for (i = 0, panelnr = 0, panelport = 0; (i < brdp->nrports); i++) {
 		portp = kzalloc(sizeof(struct stliport), GFP_KERNEL);
 		if (!portp) {
-			printk("STALLION: failed to allocate port structure\n");
+			printk(KERN_WARNING "istallion: failed to allocate port structure\n");
 			continue;
 		}
 		tty_port_init(&portp->port);
@@ -2635,7 +2615,7 @@ static int stli_initports(struct stlibrd *brdp)
 		portp->brdnr = brdp->brdnr;
 		portp->panelnr = panelnr;
 		portp->baud_base = STL_BAUDBASE;
-		portp->close_delay = STL_CLOSEDELAY;
+		portp->port.close_delay = STL_CLOSEDELAY;
 		portp->closing_wait = 30 * HZ;
 		init_waitqueue_head(&portp->port.open_wait);
 		init_waitqueue_head(&portp->port.close_wait);
@@ -2692,7 +2672,7 @@ static void __iomem *stli_ecpgetmemptr(struct stlibrd *brdp, unsigned long offse
 	unsigned char val;
 
 	if (offset > brdp->memsize) {
-		printk(KERN_ERR "STALLION: shared memory pointer=%x out of "
+		printk(KERN_ERR "istallion: shared memory pointer=%x out of "
 				"range at line=%d(%d), brd=%d\n",
 			(int) offset, line, __LINE__, brdp->brdnr);
 		ptr = NULL;
@@ -2766,7 +2746,7 @@ static void __iomem *stli_ecpeigetmemptr(struct stlibrd *brdp, unsigned long off
 	unsigned char	val;
 
 	if (offset > brdp->memsize) {
-		printk(KERN_ERR "STALLION: shared memory pointer=%x out of "
+		printk(KERN_ERR "istallion: shared memory pointer=%x out of "
 				"range at line=%d(%d), brd=%d\n",
 			(int) offset, line, __LINE__, brdp->brdnr);
 		ptr = NULL;
@@ -2818,7 +2798,7 @@ static void __iomem *stli_ecpmcgetmemptr(struct stlibrd *brdp, unsigned long off
 	unsigned char val;
 
 	if (offset > brdp->memsize) {
-		printk(KERN_ERR "STALLION: shared memory pointer=%x out of "
+		printk(KERN_ERR "istallion: shared memory pointer=%x out of "
 				"range at line=%d(%d), brd=%d\n",
 			(int) offset, line, __LINE__, brdp->brdnr);
 		ptr = NULL;
@@ -2863,7 +2843,7 @@ static void __iomem *stli_ecppcigetmemptr(struct stlibrd *brdp, unsigned long of
 	unsigned char	val;
 
 	if (offset > brdp->memsize) {
-		printk(KERN_ERR "STALLION: shared memory pointer=%x out of "
+		printk(KERN_ERR "istallion: shared memory pointer=%x out of "
 				"range at line=%d(%d), board=%d\n",
 				(int) offset, line, __LINE__, brdp->brdnr);
 		ptr = NULL;
@@ -2928,7 +2908,7 @@ static void __iomem *stli_onbgetmemptr(struct stlibrd *brdp, unsigned long offse
 	void __iomem *ptr;
 
 	if (offset > brdp->memsize) {
-		printk(KERN_ERR "STALLION: shared memory pointer=%x out of "
+		printk(KERN_ERR "istallion: shared memory pointer=%x out of "
 				"range at line=%d(%d), brd=%d\n",
 				(int) offset, line, __LINE__, brdp->brdnr);
 		ptr = NULL;
@@ -2994,7 +2974,7 @@ static void __iomem *stli_onbegetmemptr(struct stlibrd *brdp, unsigned long offs
 	unsigned char val;
 
 	if (offset > brdp->memsize) {
-		printk(KERN_ERR "STALLION: shared memory pointer=%x out of "
+		printk(KERN_ERR "istallion: shared memory pointer=%x out of "
 				"range at line=%d(%d), brd=%d\n",
 			(int) offset, line, __LINE__, brdp->brdnr);
 		ptr = NULL;
@@ -3433,7 +3413,7 @@ static int stli_startbrd(struct stlibrd *brdp)
 #endif
 
 	if (nrdevs < (brdp->nrports + 1)) {
-		printk(KERN_ERR "STALLION: slave failed to allocate memory for "
+		printk(KERN_ERR "istallion: slave failed to allocate memory for "
 				"all devices, devices=%d\n", nrdevs);
 		brdp->nrports = nrdevs - 1;
 	}
@@ -3443,13 +3423,13 @@ static int stli_startbrd(struct stlibrd *brdp)
 	brdp->bitsize = (nrdevs + 7) / 8;
 	memoff = readl(&hdrp->memp);
 	if (memoff > brdp->memsize) {
-		printk(KERN_ERR "STALLION: corrupted shared memory region?\n");
+		printk(KERN_ERR "istallion: corrupted shared memory region?\n");
 		rc = -EIO;
 		goto stli_donestartup;
 	}
 	memp = (cdkmem_t __iomem *) EBRDGETMEMPTR(brdp, memoff);
 	if (readw(&memp->dtype) != TYP_ASYNCTRL) {
-		printk(KERN_ERR "STALLION: no slave control device found\n");
+		printk(KERN_ERR "istallion: no slave control device found\n");
 		goto stli_donestartup;
 	}
 	memp++;
@@ -3534,7 +3514,7 @@ static int __devinit stli_brdinit(struct stlibrd *brdp)
 		retval = stli_initonb(brdp);
 		break;
 	default:
-		printk(KERN_ERR "STALLION: board=%d is unknown board "
+		printk(KERN_ERR "istallion: board=%d is unknown board "
 				"type=%d\n", brdp->brdnr, brdp->brdtype);
 		retval = -ENODEV;
 	}
@@ -3543,7 +3523,7 @@ static int __devinit stli_brdinit(struct stlibrd *brdp)
 		return retval;
 
 	stli_initports(brdp);
-	printk(KERN_INFO "STALLION: %s found, board=%d io=%x mem=%x "
+	printk(KERN_INFO "istallion: %s found, board=%d io=%x mem=%x "
 		"nrpanels=%d nrports=%d\n", stli_brdnames[brdp->brdtype],
 		brdp->brdnr, brdp->iobase, (int) brdp->memaddr,
 		brdp->nrpanels, brdp->nrports);
@@ -3637,7 +3617,7 @@ static int stli_eisamemprobe(struct stlibrd *brdp)
 	if (! foundit) {
 		brdp->memaddr = 0;
 		brdp->membase = NULL;
-		printk(KERN_ERR "STALLION: failed to probe shared memory "
+		printk(KERN_ERR "istallion: failed to probe shared memory "
 				"region for %s in EISA slot=%d\n",
 			stli_brdnames[brdp->brdtype], (brdp->iobase >> 12));
 		return -ENODEV;
@@ -3782,7 +3762,7 @@ static int __devinit stli_pciprobe(struct pci_dev *pdev,
 	mutex_lock(&stli_brdslock);
 	brdnr = stli_getbrdnr();
 	if (brdnr < 0) {
-		printk(KERN_INFO "STALLION: too many boards found, "
+		printk(KERN_INFO "istallion: too many boards found, "
 			"maximum supported %d\n", STL_MAXBRDS);
 		mutex_unlock(&stli_brdslock);
 		retval = -EIO;
@@ -3854,7 +3834,7 @@ static struct stlibrd *stli_allocbrd(void)
 
 	brdp = kzalloc(sizeof(struct stlibrd), GFP_KERNEL);
 	if (!brdp) {
-		printk(KERN_ERR "STALLION: failed to allocate memory "
+		printk(KERN_ERR "istallion: failed to allocate memory "
 				"(size=%Zd)\n", sizeof(struct stlibrd));
 		return NULL;
 	}
@@ -4493,7 +4473,7 @@ static int __init istallion_module_init(void)
 
 	stli_txcookbuf = kmalloc(STLI_TXBUFSIZE, GFP_KERNEL);
 	if (!stli_txcookbuf) {
-		printk(KERN_ERR "STALLION: failed to allocate memory "
+		printk(KERN_ERR "istallion: failed to allocate memory "
 				"(size=%d)\n", STLI_TXBUFSIZE);
 		retval = -ENOMEM;
 		goto err;
@@ -4518,7 +4498,7 @@ static int __init istallion_module_init(void)
 
 	retval = tty_register_driver(stli_serial);
 	if (retval) {
-		printk(KERN_ERR "STALLION: failed to register serial driver\n");
+		printk(KERN_ERR "istallion: failed to register serial driver\n");
 		goto err_ttyput;
 	}
 
@@ -4532,7 +4512,7 @@ static int __init istallion_module_init(void)
  */
 	retval = register_chrdev(STL_SIOMEMMAJOR, "staliomem", &stli_fsiomem);
 	if (retval) {
-		printk(KERN_ERR "STALLION: failed to register serial memory "
+		printk(KERN_ERR "istallion: failed to register serial memory "
 				"device\n");
 		goto err_deinit;
 	}
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 08ba6eb1a38..402c9f217f8 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -1080,57 +1080,26 @@ static void mxser_flush_buffer(struct tty_struct *tty)
 static void mxser_close(struct tty_struct *tty, struct file *filp)
 {
 	struct mxser_port *info = tty->driver_data;
+	struct tty_port *port = &info->port;
 
 	unsigned long timeout;
-	unsigned long flags;
 
 	if (tty->index == MXSER_PORTS)
 		return;
 	if (!info)
 		return;
 
-	spin_lock_irqsave(&info->port.lock, flags);
-
-	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&info->port.lock, flags);
-		return;
-	}
-	if ((tty->count == 1) && (info->port.count != 1)) {
-		/*
-		 * Uh, oh.  tty->count is 1, which means that the tty
-		 * structure will be freed.  Info->port.count should always
-		 * be one in these conditions.  If it's greater than
-		 * one, we've got real problems, since it means the
-		 * serial port won't be shutdown.
-		 */
-		printk(KERN_ERR "mxser_close: bad serial port count; "
-			"tty->count is 1, info->port.count is %d\n", info->port.count);
-		info->port.count = 1;
-	}
-	if (--info->port.count < 0) {
-		printk(KERN_ERR "mxser_close: bad serial port count for "
-			"ttys%d: %d\n", tty->index, info->port.count);
-		info->port.count = 0;
-	}
-	if (info->port.count) {
-		spin_unlock_irqrestore(&info->port.lock, flags);
+	if (tty_port_close_start(port, tty, filp) == 0)
 		return;
-	}
-	info->port.flags |= ASYNC_CLOSING;
-	spin_unlock_irqrestore(&info->port.lock, flags);
+
 	/*
 	 * Save the termios structure, since this port may have
 	 * separate termios for callout and dialin.
+	 *
+	 * FIXME: Can this go ?
 	 */
 	if (info->port.flags & ASYNC_NORMAL_ACTIVE)
 		info->normal_termios = *tty->termios;
-	/*
-	 * Now we wait for the transmit buffer to clear; and we notify
-	 * the line discipline to only process XON/XOFF characters.
-	 */
-	tty->closing = 1;
-	if (info->port.closing_wait != ASYNC_CLOSING_WAIT_NONE)
-		tty_wait_until_sent(tty, info->port.closing_wait);
 	/*
 	 * At this point we stop accepting input.  To do this, we
 	 * disable the receive line status interrupts, and tell the
@@ -1156,19 +1125,12 @@ static void mxser_close(struct tty_struct *tty, struct file *filp)
 		}
 	}
 	mxser_shutdown(tty);
-
 	mxser_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-
-	tty->closing = 0;
-	tty_port_tty_set(&info->port, NULL);
-	if (info->port.blocked_open) {
-		if (info->port.close_delay)
-			schedule_timeout_interruptible(info->port.close_delay);
-		wake_up_interruptible(&info->port.open_wait);
-	}
 
-	info->port.flags &= ~(ASYNC_NORMAL_ACTIVE | ASYNC_CLOSING);
+	/* Right now the tty_port set is done outside of the close_end helper
+	   as we don't yet have everyone using refcounts */	
+	tty_port_close_end(port, tty);
+	tty_port_tty_set(port, NULL);
 }
 
 static int mxser_write(struct tty_struct *tty, const unsigned char *buf, int count)
diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index af34c2054a0..9ac5febd8ab 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -929,35 +929,11 @@ static void rc_close(struct tty_struct *tty, struct file *filp)
 	if (!port || rc_paranoia_check(port, tty->name, "close"))
 		return;
 
-	spin_lock_irqsave(&port->port.lock, flags);
-
-	if (tty_hung_up_p(filp))
-		goto out;
-
 	bp = port_Board(port);
-	if ((tty->count == 1) && (port->port.count != 1))  {
-		printk(KERN_INFO "rc%d: rc_close: bad port count;"
-		       " tty->count is 1, port count is %d\n",
-		       board_No(bp), port->port.count);
-		port->port.count = 1;
-	}
-	if (--port->port.count < 0)  {
-		printk(KERN_INFO "rc%d: rc_close: bad port count "
-				 "for tty%d: %d\n",
-		       board_No(bp), port_No(port), port->port.count);
-		port->port.count = 0;
-	}
-	if (port->port.count)
-		goto out;
-	port->port.flags |= ASYNC_CLOSING;
-	/*
-	 * Now we wait for the transmit buffer to clear; and we notify
-	 * the line discipline to only process XON/XOFF characters.
-	 */
-	tty->closing = 1;
-	spin_unlock_irqrestore(&port->port.lock, flags);
-	if (port->port.closing_wait != ASYNC_CLOSING_WAIT_NONE)
-		tty_wait_until_sent(tty, port->port.closing_wait);
+	
+	if (tty_port_close_start(&port->port, tty, filp) == 0)
+		return;
+	
 	/*
 	 * At this point we stop accepting input.  To do this, we
 	 * disable the receive line status interrupts, and tell the
@@ -989,23 +965,8 @@ static void rc_close(struct tty_struct *tty, struct file *filp)
 	rc_shutdown_port(tty, bp, port);
 	rc_flush_buffer(tty);
 	spin_unlock_irqrestore(&riscom_lock, flags);
-	tty_ldisc_flush(tty);
 
-	spin_lock_irqsave(&port->port.lock, flags);
-	tty->closing = 0;
-	port->port.tty = NULL;
-	if (port->port.blocked_open) {
-		spin_unlock_irqrestore(&port->port.lock, flags);
-		if (port->port.close_delay)
-			msleep_interruptible(jiffies_to_msecs(port->port.close_delay));
-		wake_up_interruptible(&port->port.open_wait);
-		spin_lock_irqsave(&port->port.lock, flags);
-	}
-	port->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-	wake_up_interruptible(&port->port.close_wait);
-
-out:
-	spin_unlock_irqrestore(&riscom_lock, flags);
+	tty_port_close_end(&port->port, tty);
 }
 
 static int rc_write(struct tty_struct *tty,
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index 77eef61c46f..e1e0dd89ac9 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -833,40 +833,20 @@ static void stl_close(struct tty_struct *tty, struct file *filp)
 	pr_debug("stl_close(tty=%p,filp=%p)\n", tty, filp);
 
 	portp = tty->driver_data;
-	if (portp == NULL)
-		return;
+	BUG_ON(portp == NULL);
+
 	port = &portp->port;
 
-	spin_lock_irqsave(&port->lock, flags);
-	if (tty_hung_up_p(filp)) {
-		spin_unlock_irqrestore(&port->lock, flags);
+	if (tty_port_close_start(port, tty, filp) == 0)
 		return;
-	}
-	if (tty->count == 1 && port->count != 1)
-		port->count = 1;
-	if (port->count-- > 1) {
-		spin_unlock_irqrestore(&port->lock, flags);
-		return;
-	}
-
-	port->count = 0;
-	port->flags |= ASYNC_CLOSING;
-
 /*
  *	May want to wait for any data to drain before closing. The BUSY
  *	flag keeps track of whether we are still sending or not - it is
  *	very accurate for the cd1400, not quite so for the sc26198.
  *	(The sc26198 has no "end-of-data" interrupt only empty FIFO)
  */
-	tty->closing = 1;
-
-	spin_unlock_irqrestore(&port->lock, flags);
-
-	if (portp->closing_wait != ASYNC_CLOSING_WAIT_NONE)
-		tty_wait_until_sent(tty, portp->closing_wait);
 	stl_waituntilsent(tty, (HZ / 2));
 
-
 	spin_lock_irqsave(&port->lock, flags);
 	portp->port.flags &= ~ASYNC_INITIALIZED;
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -883,20 +863,9 @@ static void stl_close(struct tty_struct *tty, struct file *filp)
 		portp->tx.head = NULL;
 		portp->tx.tail = NULL;
 	}
-	set_bit(TTY_IO_ERROR, &tty->flags);
-	tty_ldisc_flush(tty);
 
-	tty->closing = 0;
+	tty_port_close_end(port, tty);
 	tty_port_tty_set(port, NULL);
-
-	if (port->blocked_open) {
-		if (portp->close_delay)
-			msleep_interruptible(jiffies_to_msecs(portp->close_delay));
-		wake_up_interruptible(&portp->port.open_wait);
-	}
-
-	portp->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-	wake_up_interruptible(&port->close_wait);
 }
 
 /*****************************************************************************/
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index 0ded4ed3da3..fbd5a5ce2e1 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -3104,70 +3104,18 @@ static void mgsl_close(struct tty_struct *tty, struct file * filp)
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):mgsl_close(%s) entry, count=%d\n",
 			 __FILE__,__LINE__, info->device_name, info->port.count);
-			 
-	if (!info->port.count)
-		return;
 
-	if (tty_hung_up_p(filp))
+	if (tty_port_close_start(&info->port, tty, filp) == 0)			 
 		goto cleanup;
 			
-	if ((tty->count == 1) && (info->port.count != 1)) {
-		/*
-		 * tty->count is 1 and the tty structure will be freed.
-		 * info->port.count should be one in this case.
-		 * if it's not, correct it so that the port is shutdown.
-		 */
-		printk("mgsl_close: bad refcount; tty->count is 1, "
-		       "info->port.count is %d\n", info->port.count);
-		info->port.count = 1;
-	}
-	
-	info->port.count--;
-	
-	/* if at least one open remaining, leave hardware active */
-	if (info->port.count)
-		goto cleanup;
-	
-	info->port.flags |= ASYNC_CLOSING;
-	
-	/* set tty->closing to notify line discipline to 
-	 * only process XON/XOFF characters. Only the N_TTY
-	 * discipline appears to use this (ppp does not).
-	 */
-	tty->closing = 1;
-	
-	/* wait for transmit data to clear all layers */
-	
-	if (info->port.closing_wait != ASYNC_CLOSING_WAIT_NONE) {
-		if (debug_level >= DEBUG_LEVEL_INFO)
-			printk("%s(%d):mgsl_close(%s) calling tty_wait_until_sent\n",
-				 __FILE__,__LINE__, info->device_name );
-		tty_wait_until_sent(tty, info->port.closing_wait);
-	}
-		
  	if (info->port.flags & ASYNC_INITIALIZED)
  		mgsl_wait_until_sent(tty, info->timeout);
-
 	mgsl_flush_buffer(tty);
-
 	tty_ldisc_flush(tty);
-		
 	shutdown(info);
-	
-	tty->closing = 0;
+
+	tty_port_close_end(&info->port, tty);	
 	info->port.tty = NULL;
-	
-	if (info->port.blocked_open) {
-		if (info->port.close_delay) {
-			msleep_interruptible(jiffies_to_msecs(info->port.close_delay));
-		}
-		wake_up_interruptible(&info->port.open_wait);
-	}
-	
-	info->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-			 
-	wake_up_interruptible(&info->port.close_wait);
-	
 cleanup:			
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):mgsl_close(%s) exit, count=%d\n", __FILE__,__LINE__,
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 625c9bde3be..53544e21f19 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -720,44 +720,9 @@ static void close(struct tty_struct *tty, struct file *filp)
 		return;
 	DBGINFO(("%s close entry, count=%d\n", info->device_name, info->port.count));
 
-	if (!info->port.count)
-		return;
-
-	if (tty_hung_up_p(filp))
+	if (tty_port_close_start(&info->port, tty, filp) == 0)
 		goto cleanup;
 
-	if ((tty->count == 1) && (info->port.count != 1)) {
-		/*
-		 * tty->count is 1 and the tty structure will be freed.
-		 * info->port.count should be one in this case.
-		 * if it's not, correct it so that the port is shutdown.
-		 */
-		DBGERR(("%s close: bad refcount; tty->count=1, "
-		       "info->port.count=%d\n", info->device_name, info->port.count));
-		info->port.count = 1;
-	}
-
-	info->port.count--;
-
-	/* if at least one open remaining, leave hardware active */
-	if (info->port.count)
-		goto cleanup;
-
-	info->port.flags |= ASYNC_CLOSING;
-
-	/* set tty->closing to notify line discipline to
-	 * only process XON/XOFF characters. Only the N_TTY
-	 * discipline appears to use this (ppp does not).
-	 */
-	tty->closing = 1;
-
-	/* wait for transmit data to clear all layers */
-
-	if (info->port.closing_wait != ASYNC_CLOSING_WAIT_NONE) {
-		DBGINFO(("%s call tty_wait_until_sent\n", info->device_name));
-		tty_wait_until_sent(tty, info->port.closing_wait);
-	}
-
  	if (info->port.flags & ASYNC_INITIALIZED)
  		wait_until_sent(tty, info->timeout);
 	flush_buffer(tty);
@@ -765,20 +730,8 @@ static void close(struct tty_struct *tty, struct file *filp)
 
 	shutdown(info);
 
-	tty->closing = 0;
+	tty_port_close_end(&info->port, tty);
 	info->port.tty = NULL;
-
-	if (info->port.blocked_open) {
-		if (info->port.close_delay) {
-			msleep_interruptible(jiffies_to_msecs(info->port.close_delay));
-		}
-		wake_up_interruptible(&info->port.open_wait);
-	}
-
-	info->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-
-	wake_up_interruptible(&info->port.close_wait);
-
 cleanup:
 	DBGINFO(("%s close exit, count=%d\n", tty->driver->name, info->port.count));
 }
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index 1f5c21ec4b1..2aac55bcf5f 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -810,70 +810,18 @@ static void close(struct tty_struct *tty, struct file *filp)
 		printk("%s(%d):%s close() entry, count=%d\n",
 			 __FILE__,__LINE__, info->device_name, info->port.count);
 
-	if (!info->port.count)
-		return;
-
-	if (tty_hung_up_p(filp))
-		goto cleanup;
-
-	if ((tty->count == 1) && (info->port.count != 1)) {
-		/*
-		 * tty->count is 1 and the tty structure will be freed.
-		 * info->port.count should be one in this case.
-		 * if it's not, correct it so that the port is shutdown.
-		 */
-		printk("%s(%d):%s close: bad refcount; tty->count is 1, "
-		       "info->port.count is %d\n",
-			 __FILE__,__LINE__, info->device_name, info->port.count);
-		info->port.count = 1;
-	}
-
-	info->port.count--;
-
-	/* if at least one open remaining, leave hardware active */
-	if (info->port.count)
+	if (tty_port_close_start(&info->port, tty, filp) == 0)
 		goto cleanup;
-
-	info->port.flags |= ASYNC_CLOSING;
-
-	/* set tty->closing to notify line discipline to
-	 * only process XON/XOFF characters. Only the N_TTY
-	 * discipline appears to use this (ppp does not).
-	 */
-	tty->closing = 1;
-
-	/* wait for transmit data to clear all layers */
-
-	if (info->port.closing_wait != ASYNC_CLOSING_WAIT_NONE) {
-		if (debug_level >= DEBUG_LEVEL_INFO)
-			printk("%s(%d):%s close() calling tty_wait_until_sent\n",
-				 __FILE__,__LINE__, info->device_name );
-		tty_wait_until_sent(tty, info->port.closing_wait);
-	}
-
+		
  	if (info->port.flags & ASYNC_INITIALIZED)
  		wait_until_sent(tty, info->timeout);
 
 	flush_buffer(tty);
-
 	tty_ldisc_flush(tty);
-
 	shutdown(info);
 
-	tty->closing = 0;
+	tty_port_close_end(&info->port, tty);
 	info->port.tty = NULL;
-
-	if (info->port.blocked_open) {
-		if (info->port.close_delay) {
-			msleep_interruptible(jiffies_to_msecs(info->port.close_delay));
-		}
-		wake_up_interruptible(&info->port.open_wait);
-	}
-
-	info->port.flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING);
-
-	wake_up_interruptible(&info->port.close_wait);
-
 cleanup:
 	if (debug_level >= DEBUG_LEVEL_INFO)
 		printk("%s(%d):%s close() exit, count=%d\n", __FILE__,__LINE__,
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 0723664fe0a..b3175f54fe0 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -257,3 +257,61 @@ int tty_port_block_til_ready(struct tty_port *port,
 }
 EXPORT_SYMBOL(tty_port_block_til_ready);
 
+int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (tty_hung_up_p(filp)) {
+		spin_unlock_irqrestore(&port->lock, flags);
+		return 0;
+	}
+
+	if( tty->count == 1 && port->count != 1) {
+		printk(KERN_WARNING
+		    "tty_port_close_start: tty->count = 1 port count = %d.\n",
+								port->count);
+		port->count = 1;
+	}
+	if (--port->count < 0) {
+		printk(KERN_WARNING "tty_port_close_start: count = %d\n",
+								port->count);
+		port->count = 0;
+	}
+
+	if (port->count) {
+		spin_unlock_irqrestore(&port->lock, flags);
+		return 0;
+	}
+	port->flags |= ASYNC_CLOSING;
+	tty->closing = 1;
+	spin_unlock_irqrestore(&port->lock, flags);
+	if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
+		tty_wait_until_sent(tty, port->closing_wait);
+	return 1;
+}
+EXPORT_SYMBOL(tty_port_close_start);
+
+void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
+{
+	unsigned long flags;
+
+	tty_ldisc_flush(tty);
+
+	spin_lock_irqsave(&port->lock, flags);
+	tty->closing = 0;
+
+	if (port->blocked_open) {
+		spin_unlock_irqrestore(&port->lock, flags);
+		if (port->close_delay) {
+			msleep_interruptible(
+				jiffies_to_msecs(port->close_delay));
+		}
+		spin_lock_irqsave(&port->lock, flags);
+		wake_up_interruptible(&port->open_wait);
+	}
+	port->flags &= ~(ASYNC_NORMAL_ACTIVE | ASYNC_CLOSING);
+	wake_up_interruptible(&port->close_wait);
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+EXPORT_SYMBOL(tty_port_close_end);
diff --git a/include/linux/istallion.h b/include/linux/istallion.h
index 053d5aea925..7faca98c7d1 100644
--- a/include/linux/istallion.h
+++ b/include/linux/istallion.h
@@ -59,7 +59,6 @@ struct stliport {
 	unsigned int		devnr;
 	int			baud_base;
 	int			custom_divisor;
-	int			close_delay;
 	int			closing_wait;
 	int			rc;
 	int			argsize;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 61a0ab32cf1..fc39db95499 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -441,6 +441,9 @@ extern void tty_port_raise_dtr_rts(struct tty_port *port);
 extern void tty_port_hangup(struct tty_port *port);
 extern int tty_port_block_til_ready(struct tty_port *port,
 				struct tty_struct *tty, struct file *filp);
+extern int tty_port_close_start(struct tty_port *port,
+				struct tty_struct *tty, struct file *filp);
+extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty);
 
 extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc);
 extern int tty_unregister_ldisc(int disc);
-- 
cgit v1.2.3-70-g09d2


From 39aced68d664291db3324d0fcf0985ab5626aac2 Mon Sep 17 00:00:00 2001
From: Niels de Vos <niels.devos@wincor-nixdorf.com>
Date: Fri, 2 Jan 2009 13:46:58 +0000
Subject: serial: set correct baud_base for Oxford Semiconductor Ltd EXSYS
 EX-41092 Dual 16950 Serial adapter

The PCI-card identified as "Oxford Semiconductor Ltd EXSYS EX-41092 Dual
16950 Serial adapter" is only usable with other devices (i.e. not the same
card) after doing a "setserial /dev/ttyS<n> baud_base 115200".  This
baud_base should be default for this card.

Signed-off-by: Niels de Vos <niels.devos@wincor-nixdorf.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250_pci.c | 3 +++
 include/linux/pci_ids.h   | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 0b794138f68..2a2e1c717e8 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -2387,6 +2387,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 		 * www.ussg.iu.edu/hypermail/linux/kernel/0303.1/0516.html).
 		 * For now just used the hex ID 0x950a.
 		 */
+	{	PCI_VENDOR_ID_OXSEMI, 0x950a,
+		PCI_SUBVENDOR_ID_SIIG, PCI_SUBDEVICE_ID_SIIG_DUAL_SERIAL, 0, 0,
+		pbn_b0_2_115200 },
 	{	PCI_VENDOR_ID_OXSEMI, 0x950a,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_2_1130000 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index b6e69445428..fa83dfefc5e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1766,6 +1766,7 @@
 #define PCI_DEVICE_ID_SIIG_8S_20x_650	0x2081
 #define PCI_DEVICE_ID_SIIG_8S_20x_850	0x2082
 #define PCI_SUBDEVICE_ID_SIIG_QUARTET_SERIAL	0x2050
+#define PCI_SUBDEVICE_ID_SIIG_DUAL_SERIAL	0x2530
 
 #define PCI_VENDOR_ID_RADISYS		0x1331
 
-- 
cgit v1.2.3-70-g09d2


From 60c20fb8c00a2b23308ae4517f145383bc66d291 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Fri, 2 Jan 2009 13:49:04 +0000
Subject: serial: RS485 ioctl structure uses __u32 include linux/types.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the commit below a new struct serial_rs485 was introduced for a new
ioctl:

    commit c26c56c0f40e200e61d1390629c806f6adaffbcc
    Author: Alan Cox <alan@redhat.com>
    Date:   Mon Oct 13 10:37:48 2008 +0100

	tty: Cris has a nice RS485 ioctl so we should steal it

This structure uses the __u32 types for some of its members, which leads
to the following compile error:

    $ cc -I.../include -c X.c
    In file included from X.c:2: .../include/linux/serial.h:185:
		error: expected specifier-qualifier-list before ‘__u32’
    $

It seems that these types are appropriate for this structure as it is
to be exposed to userspace.  These types are available via linux/types.h
so move the include of that outside the __KERNEL__ section.

Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/serial.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial.h b/include/linux/serial.h
index 1ea8d9265bf..9136cc5608c 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -10,8 +10,9 @@
 #ifndef _LINUX_SERIAL_H
 #define _LINUX_SERIAL_H
 
-#ifdef __KERNEL__
 #include <linux/types.h>
+
+#ifdef __KERNEL__
 #include <asm/page.h>
 
 /*
-- 
cgit v1.2.3-70-g09d2


From f751928e0ddf54ea4fe5546f35e99efc5b5d9938 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:49:21 +0000
Subject: tty: We want the port object to be persistent

Move the tty_port and uart_info bits around a little. By embedding the uart_info
into the uart_port we get rid of lots of corner case testing and also get the
ability to go port<->state<->info which is a bit more elegant than the current
data structures.

Downsides - we allocate a tiny bit more memory for unused ports, upside we've
removed as much code as it saved for most users..

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/jsm/jsm_tty.c |   2 +-
 drivers/serial/serial_core.c | 144 ++++++++++++++++++-------------------------
 include/linux/serial_core.h  |  62 ++++++++++---------
 3 files changed, 96 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/jsm/jsm_tty.c b/drivers/serial/jsm/jsm_tty.c
index a697914ae3d..3547558d2ca 100644
--- a/drivers/serial/jsm/jsm_tty.c
+++ b/drivers/serial/jsm/jsm_tty.c
@@ -272,7 +272,7 @@ static void jsm_tty_close(struct uart_port *port)
 	jsm_printk(CLOSE, INFO, &channel->ch_bd->pci_dev, "start\n");
 
 	bd = channel->ch_bd;
-	ts = channel->uart_port.info->port.tty->termios;
+	ts = port->info->port.tty->termios;
 
 	channel->ch_flags &= ~(CH_STOPI);
 
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 874786a11fe..daeba1c52c8 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -50,7 +50,7 @@ static struct lock_class_key port_lock_key;
 
 #define HIGH_BITS_OFFSET	((sizeof(long)-sizeof(int))*8)
 
-#define uart_users(state)	((state)->count + ((state)->info ? (state)->info->port.blocked_open : 0))
+#define uart_users(state)	((state)->count + (state)->info.port.blocked_open)
 
 #ifdef CONFIG_SERIAL_CORE_CONSOLE
 #define uart_console(port)	((port)->cons && (port)->cons->index == (port)->line)
@@ -94,7 +94,7 @@ static void __uart_start(struct tty_struct *tty)
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port = state->port;
 
-	if (!uart_circ_empty(&state->info->xmit) && state->info->xmit.buf &&
+	if (!uart_circ_empty(&state->info.xmit) && state->info.xmit.buf &&
 	    !tty->stopped && !tty->hw_stopped)
 		port->ops->start_tx(port);
 }
@@ -113,7 +113,7 @@ static void uart_start(struct tty_struct *tty)
 static void uart_tasklet_action(unsigned long data)
 {
 	struct uart_state *state = (struct uart_state *)data;
-	tty_wakeup(state->info->port.tty);
+	tty_wakeup(state->info.port.tty);
 }
 
 static inline void
@@ -139,7 +139,7 @@ uart_update_mctrl(struct uart_port *port, unsigned int set, unsigned int clear)
  */
 static int uart_startup(struct uart_state *state, int init_hw)
 {
-	struct uart_info *info = state->info;
+	struct uart_info *info = &state->info;
 	struct uart_port *port = state->port;
 	unsigned long page;
 	int retval = 0;
@@ -212,14 +212,15 @@ static int uart_startup(struct uart_state *state, int init_hw)
  */
 static void uart_shutdown(struct uart_state *state)
 {
-	struct uart_info *info = state->info;
+	struct uart_info *info = &state->info;
 	struct uart_port *port = state->port;
+	struct tty_struct *tty = info->port.tty;
 
 	/*
 	 * Set the TTY IO error marker
 	 */
-	if (info->port.tty)
-		set_bit(TTY_IO_ERROR, &info->port.tty->flags);
+	if (tty)
+		set_bit(TTY_IO_ERROR, &tty->flags);
 
 	if (info->flags & UIF_INITIALIZED) {
 		info->flags &= ~UIF_INITIALIZED;
@@ -227,7 +228,7 @@ static void uart_shutdown(struct uart_state *state)
 		/*
 		 * Turn off DTR and RTS early.
 		 */
-		if (!info->port.tty || (info->port.tty->termios->c_cflag & HUPCL))
+		if (!tty || (tty->termios->c_cflag & HUPCL))
 			uart_clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
 
 		/*
@@ -427,7 +428,7 @@ EXPORT_SYMBOL(uart_get_divisor);
 static void
 uart_change_speed(struct uart_state *state, struct ktermios *old_termios)
 {
-	struct tty_struct *tty = state->info->port.tty;
+	struct tty_struct *tty = state->info.port.tty;
 	struct uart_port *port = state->port;
 	struct ktermios *termios;
 
@@ -444,14 +445,14 @@ uart_change_speed(struct uart_state *state, struct ktermios *old_termios)
 	 * Set flags based on termios cflag
 	 */
 	if (termios->c_cflag & CRTSCTS)
-		state->info->flags |= UIF_CTS_FLOW;
+		state->info.flags |= UIF_CTS_FLOW;
 	else
-		state->info->flags &= ~UIF_CTS_FLOW;
+		state->info.flags &= ~UIF_CTS_FLOW;
 
 	if (termios->c_cflag & CLOCAL)
-		state->info->flags &= ~UIF_CHECK_CD;
+		state->info.flags &= ~UIF_CHECK_CD;
 	else
-		state->info->flags |= UIF_CHECK_CD;
+		state->info.flags |= UIF_CHECK_CD;
 
 	port->ops->set_termios(port, termios, old_termios);
 }
@@ -479,7 +480,7 @@ static int uart_put_char(struct tty_struct *tty, unsigned char ch)
 {
 	struct uart_state *state = tty->driver_data;
 
-	return __uart_put_char(state->port, &state->info->xmit, ch);
+	return __uart_put_char(state->port, &state->info.xmit, ch);
 }
 
 static void uart_flush_chars(struct tty_struct *tty)
@@ -500,13 +501,13 @@ uart_write(struct tty_struct *tty, const unsigned char *buf, int count)
 	 * This means you called this function _after_ the port was
 	 * closed.  No cookie for you.
 	 */
-	if (!state || !state->info) {
+	if (!state) {
 		WARN_ON(1);
 		return -EL3HLT;
 	}
 
 	port = state->port;
-	circ = &state->info->xmit;
+	circ = &state->info.xmit;
 
 	if (!circ->buf)
 		return 0;
@@ -537,7 +538,7 @@ static int uart_write_room(struct tty_struct *tty)
 	int ret;
 
 	spin_lock_irqsave(&state->port->lock, flags);
-	ret = uart_circ_chars_free(&state->info->xmit);
+	ret = uart_circ_chars_free(&state->info.xmit);
 	spin_unlock_irqrestore(&state->port->lock, flags);
 	return ret;
 }
@@ -549,7 +550,7 @@ static int uart_chars_in_buffer(struct tty_struct *tty)
 	int ret;
 
 	spin_lock_irqsave(&state->port->lock, flags);
-	ret = uart_circ_chars_pending(&state->info->xmit);
+	ret = uart_circ_chars_pending(&state->info.xmit);
 	spin_unlock_irqrestore(&state->port->lock, flags);
 	return ret;
 }
@@ -564,7 +565,7 @@ static void uart_flush_buffer(struct tty_struct *tty)
 	 * This means you called this function _after_ the port was
 	 * closed.  No cookie for you.
 	 */
-	if (!state || !state->info) {
+	if (!state) {
 		WARN_ON(1);
 		return;
 	}
@@ -573,7 +574,7 @@ static void uart_flush_buffer(struct tty_struct *tty)
 	pr_debug("uart_flush_buffer(%d) called\n", tty->index);
 
 	spin_lock_irqsave(&port->lock, flags);
-	uart_circ_clear(&state->info->xmit);
+	uart_circ_clear(&state->info.xmit);
 	if (port->ops->flush_buffer)
 		port->ops->flush_buffer(port);
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -837,15 +838,15 @@ static int uart_set_info(struct uart_state *state,
 	state->closing_wait    = closing_wait;
 	if (new_serial.xmit_fifo_size)
 		port->fifosize = new_serial.xmit_fifo_size;
-	if (state->info->port.tty)
-		state->info->port.tty->low_latency =
+	if (state->info.port.tty)
+		state->info.port.tty->low_latency =
 			(port->flags & UPF_LOW_LATENCY) ? 1 : 0;
 
  check_and_exit:
 	retval = 0;
 	if (port->type == PORT_UNKNOWN)
 		goto exit;
-	if (state->info->flags & UIF_INITIALIZED) {
+	if (state->info.flags & UIF_INITIALIZED) {
 		if (((old_flags ^ port->flags) & UPF_SPD_MASK) ||
 		    old_custom_divisor != port->custom_divisor) {
 			/*
@@ -858,7 +859,7 @@ static int uart_set_info(struct uart_state *state,
 				printk(KERN_NOTICE
 				       "%s sets custom speed on %s. This "
 				       "is deprecated.\n", current->comm,
-				       tty_name(state->info->port.tty, buf));
+				       tty_name(state->info.port.tty, buf));
 			}
 			uart_change_speed(state, NULL);
 		}
@@ -889,8 +890,8 @@ static int uart_get_lsr_info(struct uart_state *state,
 	 * interrupt happens).
 	 */
 	if (port->x_char ||
-	    ((uart_circ_chars_pending(&state->info->xmit) > 0) &&
-	     !state->info->port.tty->stopped && !state->info->port.tty->hw_stopped))
+	    ((uart_circ_chars_pending(&state->info.xmit) > 0) &&
+	     !state->info.port.tty->stopped && !state->info.port.tty->hw_stopped))
 		result &= ~TIOCSER_TEMT;
 
 	return put_user(result, value);
@@ -1017,7 +1018,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	port->ops->enable_ms(port);
 	spin_unlock_irq(&port->lock);
 
-	add_wait_queue(&state->info->delta_msr_wait, &wait);
+	add_wait_queue(&state->info.delta_msr_wait, &wait);
 	for (;;) {
 		spin_lock_irq(&port->lock);
 		memcpy(&cnow, &port->icount, sizeof(struct uart_icount));
@@ -1045,7 +1046,7 @@ uart_wait_modem_status(struct uart_state *state, unsigned long arg)
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(&state->info->delta_msr_wait, &wait);
+	remove_wait_queue(&state->info.delta_msr_wait, &wait);
 
 	return ret;
 }
@@ -1241,7 +1242,7 @@ static void uart_set_termios(struct tty_struct *tty,
 	 */
 	if (!(old_termios->c_cflag & CLOCAL) &&
 	    (tty->termios->c_cflag & CLOCAL))
-		wake_up_interruptible(&state->info->port.open_wait);
+		wake_up_interruptible(&info->port.open_wait);
 #endif
 }
 
@@ -1303,7 +1304,7 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	 * At this point, we stop accepting input.  To do this, we
 	 * disable the receive line status interrupts.
 	 */
-	if (state->info->flags & UIF_INITIALIZED) {
+	if (state->info.flags & UIF_INITIALIZED) {
 		unsigned long flags;
 		spin_lock_irqsave(&port->lock, flags);
 		port->ops->stop_rx(port);
@@ -1322,9 +1323,9 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	tty_ldisc_flush(tty);
 
 	tty->closing = 0;
-	state->info->port.tty = NULL;
+	state->info.port.tty = NULL;
 
-	if (state->info->port.blocked_open) {
+	if (state->info.port.blocked_open) {
 		if (state->close_delay)
 			msleep_interruptible(state->close_delay);
 	} else if (!uart_console(port)) {
@@ -1334,8 +1335,8 @@ static void uart_close(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Wake up anyone trying to open this port.
 	 */
-	state->info->flags &= ~UIF_NORMAL_ACTIVE;
-	wake_up_interruptible(&state->info->port.open_wait);
+	state->info.flags &= ~UIF_NORMAL_ACTIVE;
+	wake_up_interruptible(&state->info.port.open_wait);
 
  done:
 	mutex_unlock(&state->mutex);
@@ -1409,19 +1410,20 @@ static void uart_wait_until_sent(struct tty_struct *tty, int timeout)
 static void uart_hangup(struct tty_struct *tty)
 {
 	struct uart_state *state = tty->driver_data;
+	struct uart_info *info = &state->info;
 
 	BUG_ON(!kernel_locked());
 	pr_debug("uart_hangup(%d)\n", state->port->line);
 
 	mutex_lock(&state->mutex);
-	if (state->info && state->info->flags & UIF_NORMAL_ACTIVE) {
+	if (info->flags & UIF_NORMAL_ACTIVE) {
 		uart_flush_buffer(tty);
 		uart_shutdown(state);
 		state->count = 0;
-		state->info->flags &= ~UIF_NORMAL_ACTIVE;
-		state->info->port.tty = NULL;
-		wake_up_interruptible(&state->info->port.open_wait);
-		wake_up_interruptible(&state->info->delta_msr_wait);
+		info->flags &= ~UIF_NORMAL_ACTIVE;
+		info->port.tty = NULL;
+		wake_up_interruptible(&info->port.open_wait);
+		wake_up_interruptible(&info->delta_msr_wait);
 	}
 	mutex_unlock(&state->mutex);
 }
@@ -1434,7 +1436,7 @@ static void uart_hangup(struct tty_struct *tty)
  */
 static void uart_update_termios(struct uart_state *state)
 {
-	struct tty_struct *tty = state->info->port.tty;
+	struct tty_struct *tty = state->info.port.tty;
 	struct uart_port *port = state->port;
 
 	if (uart_console(port) && port->cons->cflag) {
@@ -1469,7 +1471,7 @@ static int
 uart_block_til_ready(struct file *filp, struct uart_state *state)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	struct uart_info *info = state->info;
+	struct uart_info *info = &state->info;
 	struct uart_port *port = state->port;
 	unsigned int mctrl;
 
@@ -1563,28 +1565,6 @@ static struct uart_state *uart_get(struct uart_driver *drv, int line)
 		ret = -ENXIO;
 		goto err_unlock;
 	}
-
-	/* BKL: RACE HERE - LEAK */
-	/* We should move this into the uart_state structure and kill off
-	   this whole complexity */
-	if (!state->info) {
-		state->info = kzalloc(sizeof(struct uart_info), GFP_KERNEL);
-		if (state->info) {
-			init_waitqueue_head(&state->info->port.open_wait);
-			init_waitqueue_head(&state->info->delta_msr_wait);
-
-			/*
-			 * Link the info into the other structures.
-			 */
-			state->port->info = state->info;
-
-			tasklet_init(&state->info->tlet, uart_tasklet_action,
-				     (unsigned long)state);
-		} else {
-			ret = -ENOMEM;
-			goto err_unlock;
-		}
-	}
 	return state;
 
  err_unlock:
@@ -1641,9 +1621,10 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	 * Any failures from here onwards should not touch the count.
 	 */
 	tty->driver_data = state;
+	state->port->info = &state->info;
 	tty->low_latency = (state->port->flags & UPF_LOW_LATENCY) ? 1 : 0;
 	tty->alt_speed = 0;
-	state->info->port.tty = tty;
+	state->info.port.tty = tty;
 
 	/*
 	 * If the port is in the middle of closing, bail out now.
@@ -1676,8 +1657,8 @@ static int uart_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * If this is the first open to succeed, adjust things to suit.
 	 */
-	if (retval == 0 && !(state->info->flags & UIF_NORMAL_ACTIVE)) {
-		state->info->flags |= UIF_NORMAL_ACTIVE;
+	if (retval == 0 && !(state->info.flags & UIF_NORMAL_ACTIVE)) {
+		state->info.flags |= UIF_NORMAL_ACTIVE;
 
 		uart_update_termios(state);
 	}
@@ -2028,11 +2009,11 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *port)
 	}
 	port->suspended = 1;
 
-	if (state->info && state->info->flags & UIF_INITIALIZED) {
+	if (state->info.flags & UIF_INITIALIZED) {
 		const struct uart_ops *ops = port->ops;
 		int tries;
 
-		state->info->flags = (state->info->flags & ~UIF_INITIALIZED)
+		state->info.flags = (state->info.flags & ~UIF_INITIALIZED)
 				     | UIF_SUSPENDED;
 
 		spin_lock_irq(&port->lock);
@@ -2107,15 +2088,15 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 		/*
 		 * If that's unset, use the tty termios setting.
 		 */
-		if (state->info && state->info->port.tty && termios.c_cflag == 0)
-			termios = *state->info->port.tty->termios;
+		if (state->info.port.tty && termios.c_cflag == 0)
+			termios = *state->info.port.tty->termios;
 
 		uart_change_pm(state, 0);
 		port->ops->set_termios(port, &termios, NULL);
 		console_start(port->cons);
 	}
 
-	if (state->info && state->info->flags & UIF_SUSPENDED) {
+	if (state->info.flags & UIF_SUSPENDED) {
 		const struct uart_ops *ops = port->ops;
 		int ret;
 
@@ -2130,7 +2111,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 			ops->set_mctrl(port, port->mctrl);
 			ops->start_tx(port);
 			spin_unlock_irq(&port->lock);
-			state->info->flags |= UIF_INITIALIZED;
+			state->info.flags |= UIF_INITIALIZED;
 		} else {
 			/*
 			 * Failed to resume - maybe hardware went away?
@@ -2140,7 +2121,7 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *port)
 			uart_shutdown(state);
 		}
 
-		state->info->flags &= ~UIF_SUSPENDED;
+		state->info.flags &= ~UIF_SUSPENDED;
 	}
 
 	mutex_unlock(&state->mutex);
@@ -2383,8 +2364,12 @@ int uart_register_driver(struct uart_driver *drv)
 
 		state->close_delay     = 500;	/* .5 seconds */
 		state->closing_wait    = 30000;	/* 30 seconds */
-
 		mutex_init(&state->mutex);
+
+		tty_port_init(&state->info.port);
+		init_waitqueue_head(&state->info.delta_msr_wait);
+		tasklet_init(&state->info.tlet, uart_tasklet_action,
+			     (unsigned long)state);
 	}
 
 	retval = tty_register_driver(normal);
@@ -2455,7 +2440,7 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port)
 	state->pm_state = -1;
 
 	port->cons = drv->cons;
-	port->info = state->info;
+	port->info = &state->info;
 
 	/*
 	 * If this port is a console, then the spinlock is already
@@ -2527,17 +2512,10 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *port)
 	 */
 	tty_unregister_device(drv->tty_driver, port->line);
 
-	info = state->info;
+	info = &state->info;
 	if (info && info->port.tty)
 		tty_vhangup(info->port.tty);
 
-	/*
-	 * All users of this port should now be disconnected from
-	 * this driver, and the port shut down.  We should be the
-	 * only thread fiddling with this port from now on.
-	 */
-	state->info = NULL;
-
 	/*
 	 * Free the port IO and memory resources, if any.
 	 */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index feb3b939ec4..2395969faa0 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -315,36 +315,14 @@ struct uart_port {
 	void			*private_data;		/* generic platform data pointer */
 };
 
-/*
- * This is the state information which is persistent across opens.
- * The low level driver must not to touch any elements contained
- * within.
- */
-struct uart_state {
-	unsigned int		close_delay;		/* msec */
-	unsigned int		closing_wait;		/* msec */
-
-#define USF_CLOSING_WAIT_INF	(0)
-#define USF_CLOSING_WAIT_NONE	(~0U)
-
-	int			count;
-	int			pm_state;
-	struct uart_info	*info;
-	struct uart_port	*port;
-
-	struct mutex		mutex;
-};
-
-#define UART_XMIT_SIZE	PAGE_SIZE
-
-typedef unsigned int __bitwise__ uif_t;
-
 /*
  * This is the state information which is only valid when the port
- * is open; it may be freed by the core driver once the device has
+ * is open; it may be cleared the core driver once the device has
  * been closed.  Either the low level driver or the core can modify
  * stuff here.
  */
+typedef unsigned int __bitwise__ uif_t;
+
 struct uart_info {
 	struct tty_port		port;
 	struct circ_buf		xmit;
@@ -366,6 +344,29 @@ struct uart_info {
 	wait_queue_head_t	delta_msr_wait;
 };
 
+/*
+ * This is the state information which is persistent across opens.
+ * The low level driver must not to touch any elements contained
+ * within.
+ */
+struct uart_state {
+	unsigned int		close_delay;		/* msec */
+	unsigned int		closing_wait;		/* msec */
+
+#define USF_CLOSING_WAIT_INF	(0)
+#define USF_CLOSING_WAIT_NONE	(~0U)
+
+	int			count;
+	int			pm_state;
+	struct uart_info	info;
+	struct uart_port	*port;
+
+	struct mutex		mutex;
+};
+
+#define UART_XMIT_SIZE	PAGE_SIZE
+
+
 /* number of characters left in xmit buffer before we ask for more */
 #define WAKEUP_CHARS		256
 
@@ -439,8 +440,13 @@ int uart_resume_port(struct uart_driver *reg, struct uart_port *port);
 #define uart_circ_chars_free(circ)	\
 	(CIRC_SPACE((circ)->head, (circ)->tail, UART_XMIT_SIZE))
 
-#define uart_tx_stopped(portp)		\
-	((portp)->info->port.tty->stopped || (portp)->info->port.tty->hw_stopped)
+static inline int uart_tx_stopped(struct uart_port *port)
+{
+	struct tty_struct *tty = port->info->port.tty;
+	if(tty->stopped || tty->hw_stopped)
+		return 1;
+	return 0;
+}
 
 /*
  * The following are helper functions for the low level drivers.
@@ -451,7 +457,7 @@ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
 #ifdef SUPPORT_SYSRQ
 	if (port->sysrq) {
 		if (ch && time_before(jiffies, port->sysrq)) {
-			handle_sysrq(ch, port->info ? port->info->port.tty : NULL);
+			handle_sysrq(ch, port->info->port.tty);
 			port->sysrq = 0;
 			return 1;
 		}
-- 
cgit v1.2.3-70-g09d2


From 7d6a07d123b62bf4fa71867420c23da3ca36c995 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Fri, 2 Jan 2009 13:49:47 +0000
Subject: 8250: Serial driver changes to support future Cavium OCTEON serial
 patches.

In order to use Cavium OCTEON specific serial i/o drivers, we first
patch the 8250 driver to use replaceable I/O functions.  Compatible
I/O functions are added for existing iotypeS.

An added benefit of this change is that it makes it easy to factor
some of the existing special cases out to board/SOC specific support
code.

The alternative is to load up 8250.c with a bunch of OCTEON specific
iotype code and bug work-arounds.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Signed-off-by: Tomaso Paoletti <tpaoletti@caviumnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c       | 194 +++++++++++++++++++++++++++++++-------------
 include/linux/serial_8250.h |   2 +
 include/linux/serial_core.h |   2 +
 3 files changed, 140 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 8e28750a405..849af9d21fe 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -303,16 +303,16 @@ static const u8 au_io_out_map[] = {
 };
 
 /* sane hardware needs no mapping */
-static inline int map_8250_in_reg(struct uart_8250_port *up, int offset)
+static inline int map_8250_in_reg(struct uart_port *p, int offset)
 {
-	if (up->port.iotype != UPIO_AU)
+	if (p->iotype != UPIO_AU)
 		return offset;
 	return au_io_in_map[offset];
 }
 
-static inline int map_8250_out_reg(struct uart_8250_port *up, int offset)
+static inline int map_8250_out_reg(struct uart_port *p, int offset)
 {
-	if (up->port.iotype != UPIO_AU)
+	if (p->iotype != UPIO_AU)
 		return offset;
 	return au_io_out_map[offset];
 }
@@ -341,16 +341,16 @@ static const u8
 		[UART_SCR]	= 0x2c
 	};
 
-static inline int map_8250_in_reg(struct uart_8250_port *up, int offset)
+static inline int map_8250_in_reg(struct uart_port *p, int offset)
 {
-	if (up->port.iotype != UPIO_RM9000)
+	if (p->iotype != UPIO_RM9000)
 		return offset;
 	return regmap_in[offset];
 }
 
-static inline int map_8250_out_reg(struct uart_8250_port *up, int offset)
+static inline int map_8250_out_reg(struct uart_port *p, int offset)
 {
-	if (up->port.iotype != UPIO_RM9000)
+	if (p->iotype != UPIO_RM9000)
 		return offset;
 	return regmap_out[offset];
 }
@@ -363,108 +363,170 @@ static inline int map_8250_out_reg(struct uart_8250_port *up, int offset)
 
 #endif
 
-static unsigned int serial_in(struct uart_8250_port *up, int offset)
+static unsigned int hub6_serial_in(struct uart_port *p, int offset)
 {
-	unsigned int tmp;
-	offset = map_8250_in_reg(up, offset) << up->port.regshift;
+	offset = map_8250_in_reg(p, offset) << p->regshift;
+	outb(p->hub6 - 1 + offset, p->iobase);
+	return inb(p->iobase + 1);
+}
 
-	switch (up->port.iotype) {
-	case UPIO_HUB6:
-		outb(up->port.hub6 - 1 + offset, up->port.iobase);
-		return inb(up->port.iobase + 1);
+static void hub6_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	outb(p->hub6 - 1 + offset, p->iobase);
+	outb(value, p->iobase + 1);
+}
 
-	case UPIO_MEM:
-	case UPIO_DWAPB:
-		return readb(up->port.membase + offset);
+static unsigned int mem_serial_in(struct uart_port *p, int offset)
+{
+	offset = map_8250_in_reg(p, offset) << p->regshift;
+	return readb(p->membase + offset);
+}
 
-	case UPIO_RM9000:
-	case UPIO_MEM32:
-		return readl(up->port.membase + offset);
+static void mem_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	writeb(value, p->membase + offset);
+}
+
+static void mem32_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	writel(value, p->membase + offset);
+}
+
+static unsigned int mem32_serial_in(struct uart_port *p, int offset)
+{
+	offset = map_8250_in_reg(p, offset) << p->regshift;
+	return readl(p->membase + offset);
+}
 
 #ifdef CONFIG_SERIAL_8250_AU1X00
-	case UPIO_AU:
-		return __raw_readl(up->port.membase + offset);
+static unsigned int au_serial_in(struct uart_port *p, int offset)
+{
+	offset = map_8250_in_reg(p, offset) << p->regshift;
+	return __raw_readl(p->membase + offset);
+}
+
+static void au_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	__raw_writel(value, p->membase + offset);
+}
 #endif
 
-	case UPIO_TSI:
-		if (offset == UART_IIR) {
-			tmp = readl(up->port.membase + (UART_IIR & ~3));
-			return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */
-		} else
-			return readb(up->port.membase + offset);
+static unsigned int tsi_serial_in(struct uart_port *p, int offset)
+{
+	unsigned int tmp;
+	offset = map_8250_in_reg(p, offset) << p->regshift;
+	if (offset == UART_IIR) {
+		tmp = readl(p->membase + (UART_IIR & ~3));
+		return (tmp >> 16) & 0xff; /* UART_IIR % 4 == 2 */
+	} else
+		return readb(p->membase + offset);
+}
 
-	default:
-		return inb(up->port.iobase + offset);
-	}
+static void tsi_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	if (!((offset == UART_IER) && (value & UART_IER_UUE)))
+		writeb(value, p->membase + offset);
 }
 
-static void
-serial_out(struct uart_8250_port *up, int offset, int value)
+static void dwapb_serial_out(struct uart_port *p, int offset, int value)
 {
-	/* Save the offset before it's remapped */
 	int save_offset = offset;
-	offset = map_8250_out_reg(up, offset) << up->port.regshift;
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	/* Save the LCR value so it can be re-written when a
+	 * Busy Detect interrupt occurs. */
+	if (save_offset == UART_LCR) {
+		struct uart_8250_port *up = (struct uart_8250_port *)p;
+		up->lcr = value;
+	}
+	writeb(value, p->membase + offset);
+	/* Read the IER to ensure any interrupt is cleared before
+	 * returning from ISR. */
+	if (save_offset == UART_TX || save_offset == UART_IER)
+		value = p->serial_in(p, UART_IER);
+}
 
-	switch (up->port.iotype) {
+static unsigned int io_serial_in(struct uart_port *p, int offset)
+{
+	offset = map_8250_in_reg(p, offset) << p->regshift;
+	return inb(p->iobase + offset);
+}
+
+static void io_serial_out(struct uart_port *p, int offset, int value)
+{
+	offset = map_8250_out_reg(p, offset) << p->regshift;
+	outb(value, p->iobase + offset);
+}
+
+static void set_io_from_upio(struct uart_port *p)
+{
+	switch (p->iotype) {
 	case UPIO_HUB6:
-		outb(up->port.hub6 - 1 + offset, up->port.iobase);
-		outb(value, up->port.iobase + 1);
+		p->serial_in = hub6_serial_in;
+		p->serial_out = hub6_serial_out;
 		break;
 
 	case UPIO_MEM:
-		writeb(value, up->port.membase + offset);
+		p->serial_in = mem_serial_in;
+		p->serial_out = mem_serial_out;
 		break;
 
 	case UPIO_RM9000:
 	case UPIO_MEM32:
-		writel(value, up->port.membase + offset);
+		p->serial_in = mem32_serial_in;
+		p->serial_out = mem32_serial_out;
 		break;
 
 #ifdef CONFIG_SERIAL_8250_AU1X00
 	case UPIO_AU:
-		__raw_writel(value, up->port.membase + offset);
+		p->serial_in = au_serial_in;
+		p->serial_out = au_serial_out;
 		break;
 #endif
 	case UPIO_TSI:
-		if (!((offset == UART_IER) && (value & UART_IER_UUE)))
-			writeb(value, up->port.membase + offset);
+		p->serial_in = tsi_serial_in;
+		p->serial_out = tsi_serial_out;
 		break;
 
 	case UPIO_DWAPB:
-		/* Save the LCR value so it can be re-written when a
-		 * Busy Detect interrupt occurs. */
-		if (save_offset == UART_LCR)
-			up->lcr = value;
-		writeb(value, up->port.membase + offset);
-		/* Read the IER to ensure any interrupt is cleared before
-		 * returning from ISR. */
-		if (save_offset == UART_TX || save_offset == UART_IER)
-			value = serial_in(up, UART_IER);
+		p->serial_in = mem_serial_in;
+		p->serial_out = dwapb_serial_out;
 		break;
 
 	default:
-		outb(value, up->port.iobase + offset);
+		p->serial_in = io_serial_in;
+		p->serial_out = io_serial_out;
+		break;
 	}
 }
 
 static void
 serial_out_sync(struct uart_8250_port *up, int offset, int value)
 {
-	switch (up->port.iotype) {
+	struct uart_port *p = &up->port;
+	switch (p->iotype) {
 	case UPIO_MEM:
 	case UPIO_MEM32:
 #ifdef CONFIG_SERIAL_8250_AU1X00
 	case UPIO_AU:
 #endif
 	case UPIO_DWAPB:
-		serial_out(up, offset, value);
-		serial_in(up, UART_LCR);	/* safe, no side-effects */
+		p->serial_out(p, offset, value);
+		p->serial_in(p, UART_LCR);	/* safe, no side-effects */
 		break;
 	default:
-		serial_out(up, offset, value);
+		p->serial_out(p, offset, value);
 	}
 }
 
+#define serial_in(up, offset)		\
+	(up->port.serial_in(&(up)->port, (offset)))
+#define serial_out(up, offset, value)	\
+	(up->port.serial_out(&(up)->port, (offset), (value)))
 /*
  * We used to support using pause I/O for certain machines.  We
  * haven't supported this for a while, but just in case it's badly
@@ -2576,6 +2638,7 @@ static void __init serial8250_isa_init_ports(void)
 		up->port.membase  = old_serial_port[i].iomem_base;
 		up->port.iotype   = old_serial_port[i].io_type;
 		up->port.regshift = old_serial_port[i].iomem_reg_shift;
+		set_io_from_upio(&up->port);
 		if (share_irqs)
 			up->port.flags |= UPF_SHARE_IRQ;
 	}
@@ -2769,6 +2832,13 @@ int __init early_serial_setup(struct uart_port *port)
 	p->flags        = port->flags;
 	p->mapbase      = port->mapbase;
 	p->private_data = port->private_data;
+
+	set_io_from_upio(p);
+	if (port->serial_in)
+		p->serial_in = port->serial_in;
+	if (port->serial_out)
+		p->serial_out = port->serial_out;
+
 	return 0;
 }
 
@@ -2833,6 +2903,8 @@ static int __devinit serial8250_probe(struct platform_device *dev)
 		port.mapbase		= p->mapbase;
 		port.hub6		= p->hub6;
 		port.private_data	= p->private_data;
+		port.serial_in		= p->serial_in;
+		port.serial_out		= p->serial_out;
 		port.dev		= &dev->dev;
 		if (share_irqs)
 			port.flags |= UPF_SHARE_IRQ;
@@ -2986,6 +3058,12 @@ int serial8250_register_port(struct uart_port *port)
 		uart->port.private_data = port->private_data;
 		if (port->dev)
 			uart->port.dev = port->dev;
+		set_io_from_upio(&uart->port);
+		/* Possibly override default I/O functions.  */
+		if (port->serial_in)
+			uart->port.serial_in = port->serial_in;
+		if (port->serial_out)
+			uart->port.serial_out = port->serial_out;
 
 		ret = uart_add_one_port(&serial8250_reg, &uart->port);
 		if (ret == 0)
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 3d37c94abbc..77d83d929f2 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -28,6 +28,8 @@ struct plat_serial8250_port {
 	unsigned char	iotype;		/* UPIO_* */
 	unsigned char	hub6;
 	upf_t		flags;		/* UPF_* flags */
+	unsigned int	(*serial_in)(struct uart_port *, int);
+	void		(*serial_out)(struct uart_port *, int, int);
 };
 
 /*
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 2395969faa0..60061f44f3d 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -248,6 +248,8 @@ struct uart_port {
 	spinlock_t		lock;			/* port lock */
 	unsigned long		iobase;			/* in/out[bwl] */
 	unsigned char __iomem	*membase;		/* read/write[bwl] */
+	unsigned int		(*serial_in)(struct uart_port *, int);
+	void			(*serial_out)(struct uart_port *, int, int);
 	unsigned int		irq;			/* irq number */
 	unsigned int		uartclk;		/* base uart clock */
 	unsigned int		fifosize;		/* tx fifo size */
-- 
cgit v1.2.3-70-g09d2


From 8e23fcc89c8091790903927449f8efb9b4e23960 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Fri, 2 Jan 2009 13:49:54 +0000
Subject: Serial: Allow port type to be specified when calling
 serial8250_register_port.

Add flag value UPF_FIXED_TYPE which specifies that the UART type is
known and should not be probed.  For this case the UARTs properties
are just copied out of the uart_config entry.

This allows us to keep SOC specific 8250 probe code out of 8250.c.  In
this case we know the serial hardware will not be changing as it is on
the same silicon as the CPU, and we can specify it with certainty in
the board/cpu setup code.

The alternative is to load up 8250.c with a bunch of OCTEON specific
special cases in the probing code.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c        | 9 +++++++++
 drivers/serial/serial_core.c | 7 +++++--
 include/linux/serial_8250.h  | 1 +
 include/linux/serial_core.h  | 2 ++
 4 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 849af9d21fe..3ae497422db 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -2903,6 +2903,7 @@ static int __devinit serial8250_probe(struct platform_device *dev)
 		port.mapbase		= p->mapbase;
 		port.hub6		= p->hub6;
 		port.private_data	= p->private_data;
+		port.type		= p->type;
 		port.serial_in		= p->serial_in;
 		port.serial_out		= p->serial_out;
 		port.dev		= &dev->dev;
@@ -3058,6 +3059,14 @@ int serial8250_register_port(struct uart_port *port)
 		uart->port.private_data = port->private_data;
 		if (port->dev)
 			uart->port.dev = port->dev;
+
+		if (port->flags & UPF_FIXED_TYPE) {
+			uart->port.type = port->type;
+			uart->port.fifosize = uart_config[port->type].fifo_size;
+			uart->capabilities = uart_config[port->type].flags;
+			uart->tx_loadsz = uart_config[port->type].tx_loadsz;
+		}
+
 		set_io_from_upio(&uart->port);
 		/* Possibly override default I/O functions.  */
 		if (port->serial_in)
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index 9425ed69e0f..dc68b7e0c93 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -2179,11 +2179,14 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state,
 	 * Now do the auto configuration stuff.  Note that config_port
 	 * is expected to claim the resources and map the port for us.
 	 */
-	flags = UART_CONFIG_TYPE;
+	flags = 0;
 	if (port->flags & UPF_AUTO_IRQ)
 		flags |= UART_CONFIG_IRQ;
 	if (port->flags & UPF_BOOT_AUTOCONF) {
-		port->type = PORT_UNKNOWN;
+		if (!(port->flags & UPF_FIXED_TYPE)) {
+			port->type = PORT_UNKNOWN;
+			flags |= UART_CONFIG_TYPE;
+		}
 		port->ops->config_port(port, flags);
 	}
 
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 77d83d929f2..d4d2a78ad43 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -28,6 +28,7 @@ struct plat_serial8250_port {
 	unsigned char	iotype;		/* UPIO_* */
 	unsigned char	hub6;
 	upf_t		flags;		/* UPF_* flags */
+	unsigned int	type;		/* If UPF_FIXED_TYPE */
 	unsigned int	(*serial_in)(struct uart_port *, int);
 	void		(*serial_out)(struct uart_port *, int, int);
 };
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 60061f44f3d..f155252f148 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -295,6 +295,8 @@ struct uart_port {
 #define UPF_MAGIC_MULTIPLIER	((__force upf_t) (1 << 16))
 #define UPF_CONS_FLOW		((__force upf_t) (1 << 23))
 #define UPF_SHARE_IRQ		((__force upf_t) (1 << 24))
+/* The exact UART type is known and should not be probed.  */
+#define UPF_FIXED_TYPE		((__force upf_t) (1 << 27))
 #define UPF_BOOT_AUTOCONF	((__force upf_t) (1 << 28))
 #define UPF_FIXED_PORT		((__force upf_t) (1 << 29))
 #define UPF_DEAD		((__force upf_t) (1 << 30))
-- 
cgit v1.2.3-70-g09d2


From 6b06f19151c335ee0c5b61839fa4e6838182ebb8 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Fri, 2 Jan 2009 13:50:00 +0000
Subject: Serial: UART driver changes for Cavium OCTEON.

Cavium UART implementation is not covered by existing uart_configS.
Define a new uart_config (PORT_OCTEON) which is specified by OCTEON
platform device registration code.

Signed-off-by: Tomaso Paoletti <tpaoletti@caviumnetworks.com>
Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c       | 7 +++++++
 include/linux/serial_core.h | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 3ae497422db..daa00567bc4 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -279,6 +279,13 @@ static const struct serial8250_config uart_config[] = {
 		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
 		.flags		= UART_CAP_FIFO,
 	},
+	[PORT_OCTEON] = {
+		.name		= "OCTEON",
+		.fifo_size	= 64,
+		.tx_loadsz	= 64,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
+		.flags		= UART_CAP_FIFO,
+	},
 };
 
 #if defined (CONFIG_SERIAL_8250_AU1X00)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index f155252f148..b4199841f1f 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -40,7 +40,8 @@
 #define PORT_NS16550A	14
 #define PORT_XSCALE	15
 #define PORT_RM9000	16	/* PMC-Sierra RM9xxx internal UART */
-#define PORT_MAX_8250	16	/* max port ID */
+#define PORT_OCTEON	17	/* Cavium OCTEON internal UART */
+#define PORT_MAX_8250	17	/* max port ID */
 
 /*
  * ARM specific type numbers.  These are not currently guaranteed
-- 
cgit v1.2.3-70-g09d2


From e65f0f8271b1b0452334e5da37fd35413a000de4 Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fleitner@redhat.com>
Date: Fri, 2 Jan 2009 13:50:43 +0000
Subject: serial_8250: support for Sealevel Systems Model 7803 COMM+8

Add support for Sealevel Systems Model 7803 COMM+8

Signed-off-by: Flavio Leitner <fleitner@redhat.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250_pci.c | 3 +++
 include/linux/pci_ids.h   | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 2a2e1c717e8..c088146b751 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -2287,6 +2287,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 	{	PCI_VENDOR_ID_SEALEVEL, PCI_DEVICE_ID_SEALEVEL_COMM8,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b2_8_115200 },
+	{	PCI_VENDOR_ID_SEALEVEL, PCI_DEVICE_ID_SEALEVEL_7803,
+		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+		pbn_b2_8_460800 },
 	{	PCI_VENDOR_ID_SEALEVEL, PCI_DEVICE_ID_SEALEVEL_UCOMM8,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b2_8_115200 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index fa83dfefc5e..218c73b1e6d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1796,6 +1796,7 @@
 #define PCI_DEVICE_ID_SEALEVEL_UCOMM232	0x7202
 #define PCI_DEVICE_ID_SEALEVEL_COMM4	0x7401
 #define PCI_DEVICE_ID_SEALEVEL_COMM8	0x7801
+#define PCI_DEVICE_ID_SEALEVEL_7803	0x7803
 #define PCI_DEVICE_ID_SEALEVEL_UCOMM8	0x7804
 
 #define PCI_VENDOR_ID_HYPERCOPE		0x1365
-- 
cgit v1.2.3-70-g09d2


From aecde8b53b8ee1330a5a8206200f0d6b8845a6e0 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Tue, 30 Dec 2008 07:14:19 -0300
Subject: V4L/DVB (10141): v4l2: debugging API changed to match against driver
 name instead of ID.

Since the i2c driver ID will be removed in the near future we have to
modify the v4l2 debugging API to use the driver name instead of driver ID.

Note that this API is not used in applications other than v4l2-dbg.cpp
as it is for debugging and testing only.

Should anyone use the old VIDIOC_G_CHIP_IDENT, then this will be logged
with a warning that it is deprecated and will be removed in 2.6.30.

Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 Documentation/video4linux/v4l2-framework.txt    |  2 +-
 drivers/media/video/bt8xx/bttv-driver.c         |  9 +++--
 drivers/media/video/cafe_ccic.c                 |  7 ++--
 drivers/media/video/cs5345.c                    | 13 +++----
 drivers/media/video/cs53l32a.c                  |  2 +-
 drivers/media/video/cx18/cx18-i2c.c             | 28 +-------------
 drivers/media/video/cx18/cx18-i2c.h             |  1 -
 drivers/media/video/cx18/cx18-ioctl.c           | 41 ++++++++------------
 drivers/media/video/cx23885/cx23885-video.c     |  8 ++--
 drivers/media/video/cx25840/cx25840-core.c      | 13 +++----
 drivers/media/video/cx88/cx88-video.c           | 13 ++++---
 drivers/media/video/em28xx/em28xx-video.c       | 28 +++++++-------
 drivers/media/video/ivtv/ivtv-driver.c          |  7 ++--
 drivers/media/video/ivtv/ivtv-ioctl.c           | 21 +++++-----
 drivers/media/video/m52790.c                    | 13 +++----
 drivers/media/video/msp3400-driver.c            |  2 +-
 drivers/media/video/mt9m001.c                   | 19 ++++-----
 drivers/media/video/mt9m111.c                   | 19 ++++-----
 drivers/media/video/mt9t031.c                   | 18 ++++-----
 drivers/media/video/mt9v022.c                   | 19 ++++-----
 drivers/media/video/ov7670.c                    |  2 +-
 drivers/media/video/ov772x.c                    |  7 ++--
 drivers/media/video/pvrusb2/pvrusb2-hdw.c       | 11 +++---
 drivers/media/video/pvrusb2/pvrusb2-hdw.h       |  4 +-
 drivers/media/video/pvrusb2/pvrusb2-v4l2.c      |  6 +--
 drivers/media/video/saa7115.c                   | 13 +++----
 drivers/media/video/saa7127.c                   | 13 +++----
 drivers/media/video/saa7134/saa6752hs.c         |  2 +-
 drivers/media/video/saa7134/saa7134-empress.c   | 14 +++----
 drivers/media/video/saa7134/saa7134-video.c     |  9 +++--
 drivers/media/video/saa717x.c                   |  9 +++--
 drivers/media/video/soc_camera.c                |  6 +--
 drivers/media/video/tvaudio.c                   |  2 +-
 drivers/media/video/tvp5150.c                   | 13 +++----
 drivers/media/video/tw9910.c                    |  6 +--
 drivers/media/video/upd64031a.c                 | 13 +++----
 drivers/media/video/upd64083.c                  | 13 +++----
 drivers/media/video/usbvision/usbvision-video.c |  9 +++--
 drivers/media/video/v4l2-common.c               | 29 +++++++++-----
 drivers/media/video/v4l2-compat-ioctl32.c       |  3 +-
 drivers/media/video/v4l2-ioctl.c                | 15 +++++---
 drivers/media/video/v4l2-subdev.c               |  2 +-
 drivers/media/video/vp27smpx.c                  |  2 +-
 drivers/media/video/wm8739.c                    |  2 +-
 drivers/media/video/wm8775.c                    |  2 +-
 include/linux/videodev2.h                       | 51 ++++++++++++++++++-------
 include/media/soc_camera.h                      |  6 +--
 include/media/v4l2-chip-ident.h                 |  4 +-
 include/media/v4l2-common.h                     |  6 +--
 include/media/v4l2-int-device.h                 |  2 +-
 include/media/v4l2-ioctl.h                      |  6 +--
 include/media/v4l2-subdev.h                     |  6 +--
 52 files changed, 290 insertions(+), 281 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index 3b483c1e012..ff124374e9b 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -184,7 +184,7 @@ may be NULL if the subdev driver does not support anything from that category.
 It looks like this:
 
 struct v4l2_subdev_core_ops {
-	int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip);
+	int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip);
 	int (*log_status)(struct v4l2_subdev *sd);
 	int (*init)(struct v4l2_subdev *sd, u32 val);
 	...
diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c
index ebcb8e5e9c4..d2f43bd2f84 100644
--- a/drivers/media/video/bt8xx/bttv-driver.c
+++ b/drivers/media/video/bt8xx/bttv-driver.c
@@ -2039,7 +2039,7 @@ static int bttv_log_status(struct file *file, void *f)
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int bttv_g_register(struct file *file, void *f,
-					struct v4l2_register *reg)
+					struct v4l2_dbg_register *reg)
 {
 	struct bttv_fh *fh = f;
 	struct bttv *btv = fh->btv;
@@ -2047,18 +2047,19 @@ static int bttv_g_register(struct file *file, void *f,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 
 	/* bt848 has a 12-bit register space */
 	reg->reg &= 0xfff;
 	reg->val = btread(reg->reg);
+	reg->size = 1;
 
 	return 0;
 }
 
 static int bttv_s_register(struct file *file, void *f,
-					struct v4l2_register *reg)
+					struct v4l2_dbg_register *reg)
 {
 	struct bttv_fh *fh = f;
 	struct bttv *btv = fh->btv;
@@ -2066,7 +2067,7 @@ static int bttv_s_register(struct file *file, void *f,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 
 	/* bt848 has a 12-bit register space */
diff --git a/drivers/media/video/cafe_ccic.c b/drivers/media/video/cafe_ccic.c
index 476171cf500..34a39d2e470 100644
--- a/drivers/media/video/cafe_ccic.c
+++ b/drivers/media/video/cafe_ccic.c
@@ -859,7 +859,7 @@ static int __cafe_cam_reset(struct cafe_camera *cam)
  */
 static int cafe_cam_init(struct cafe_camera *cam)
 {
-	struct v4l2_chip_ident chip = { V4L2_CHIP_MATCH_I2C_ADDR, 0, 0, 0 };
+	struct v4l2_dbg_chip_ident chip;
 	int ret;
 
 	mutex_lock(&cam->s_mutex);
@@ -869,8 +869,9 @@ static int cafe_cam_init(struct cafe_camera *cam)
 	ret = __cafe_cam_reset(cam);
 	if (ret)
 		goto out;
-	chip.match_chip = cam->sensor->addr;
-	ret = __cafe_cam_cmd(cam, VIDIOC_G_CHIP_IDENT, &chip);
+	chip.match.type = V4L2_CHIP_MATCH_I2C_ADDR;
+	chip.match.addr = cam->sensor->addr;
+	ret = __cafe_cam_cmd(cam, VIDIOC_DBG_G_CHIP_IDENT, &chip);
 	if (ret)
 		goto out;
 	cam->sensor_type = chip.ident;
diff --git a/drivers/media/video/cs5345.c b/drivers/media/video/cs5345.c
index 70fcd0d5de1..14bebf8a116 100644
--- a/drivers/media/video/cs5345.c
+++ b/drivers/media/video/cs5345.c
@@ -95,25 +95,24 @@ static int cs5345_s_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl)
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int cs5345_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int cs5345_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+	reg->size = 1;
 	reg->val = cs5345_read(sd, reg->reg & 0x1f);
 	return 0;
 }
 
-static int cs5345_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int cs5345_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -122,7 +121,7 @@ static int cs5345_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
 }
 #endif
 
-static int cs5345_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int cs5345_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/cs53l32a.c b/drivers/media/video/cs53l32a.c
index cb65d519cf7..7292a6316e6 100644
--- a/drivers/media/video/cs53l32a.c
+++ b/drivers/media/video/cs53l32a.c
@@ -102,7 +102,7 @@ static int cs53l32a_s_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl)
 	return 0;
 }
 
-static int cs53l32a_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int cs53l32a_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/cx18/cx18-i2c.c b/drivers/media/video/cx18/cx18-i2c.c
index 8941f58bed7..83e1c633312 100644
--- a/drivers/media/video/cx18/cx18-i2c.c
+++ b/drivers/media/video/cx18/cx18-i2c.c
@@ -242,7 +242,7 @@ int cx18_call_i2c_client(struct cx18 *cx, int addr, unsigned cmd, void *arg)
 			return retval;
 		}
 	}
-	if (cmd != VIDIOC_G_CHIP_IDENT)
+	if (cmd != VIDIOC_DBG_G_CHIP_IDENT)
 		CX18_ERR("i2c addr 0x%02x not found for cmd 0x%x!\n",
 			       addr, cmd);
 	return -ENODEV;
@@ -268,17 +268,6 @@ static int cx18_i2c_id_addr(struct cx18 *cx, u32 id)
 	return retval;
 }
 
-/* Find the i2c device name matching the DRIVERID */
-static const char *cx18_i2c_id_name(u32 id)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(hw_driverids); i++)
-		if (hw_driverids[i] == id)
-			return hw_devicenames[i];
-	return "unknown device";
-}
-
 /* Find the i2c device name matching the CX18_HW_ flag */
 static const char *cx18_i2c_hw_name(u32 hw)
 {
@@ -326,21 +315,6 @@ int cx18_i2c_hw(struct cx18 *cx, u32 hw, unsigned int cmd, void *arg)
 	return cx18_call_i2c_client(cx, addr, cmd, arg);
 }
 
-/* Calls i2c device based on I2C driver ID. */
-int cx18_i2c_id(struct cx18 *cx, u32 id, unsigned int cmd, void *arg)
-{
-	int addr;
-
-	addr = cx18_i2c_id_addr(cx, id);
-	if (addr < 0) {
-		if (cmd != VIDIOC_G_CHIP_IDENT)
-			CX18_ERR("i2c ID 0x%08x (%s) not found for cmd 0x%x!\n",
-				id, cx18_i2c_id_name(id), cmd);
-		return addr;
-	}
-	return cx18_call_i2c_client(cx, addr, cmd, arg);
-}
-
 /* broadcast cmd for all I2C clients and for the gpio subsystem */
 void cx18_call_i2c_clients(struct cx18 *cx, unsigned int cmd, void *arg)
 {
diff --git a/drivers/media/video/cx18/cx18-i2c.h b/drivers/media/video/cx18/cx18-i2c.h
index 113c3f9a2cc..4869739013b 100644
--- a/drivers/media/video/cx18/cx18-i2c.h
+++ b/drivers/media/video/cx18/cx18-i2c.h
@@ -23,7 +23,6 @@
 
 int cx18_i2c_hw_addr(struct cx18 *cx, u32 hw);
 int cx18_i2c_hw(struct cx18 *cx, u32 hw, unsigned int cmd, void *arg);
-int cx18_i2c_id(struct cx18 *cx, u32 id, unsigned int cmd, void *arg);
 int cx18_call_i2c_client(struct cx18 *cx, int addr, unsigned cmd, void *arg);
 void cx18_call_i2c_clients(struct cx18 *cx, unsigned int cmd, void *arg);
 int cx18_i2c_register(struct cx18 *cx, unsigned idx);
diff --git a/drivers/media/video/cx18/cx18-ioctl.c b/drivers/media/video/cx18/cx18-ioctl.c
index 8aa152b3954..7086aaba77d 100644
--- a/drivers/media/video/cx18/cx18-ioctl.c
+++ b/drivers/media/video/cx18/cx18-ioctl.c
@@ -254,30 +254,24 @@ static int cx18_s_fmt_sliced_vbi_cap(struct file *file, void *fh,
 }
 
 static int cx18_g_chip_ident(struct file *file, void *fh,
-				struct v4l2_chip_ident *chip)
+				struct v4l2_dbg_chip_ident *chip)
 {
 	struct cx18 *cx = ((struct cx18_open_id *)fh)->cx;
 
 	chip->ident = V4L2_IDENT_NONE;
 	chip->revision = 0;
-	if (chip->match_type == V4L2_CHIP_MATCH_HOST) {
-		if (v4l2_chip_match_host(chip->match_type, chip->match_chip))
-			chip->ident = V4L2_IDENT_CX23418;
+	if (v4l2_chip_match_host(&chip->match)) {
+		chip->ident = V4L2_IDENT_CX23418;
 		return 0;
 	}
-	if (chip->match_type == V4L2_CHIP_MATCH_I2C_DRIVER)
-		return cx18_i2c_id(cx, chip->match_chip, VIDIOC_G_CHIP_IDENT,
-					chip);
-	if (chip->match_type == V4L2_CHIP_MATCH_I2C_ADDR)
-		return cx18_call_i2c_client(cx, chip->match_chip,
-						VIDIOC_G_CHIP_IDENT, chip);
-	return -EINVAL;
+	cx18_call_i2c_clients(cx, VIDIOC_DBG_G_CHIP_IDENT, chip);
+	return 0;
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int cx18_cxc(struct cx18 *cx, unsigned int cmd, void *arg)
 {
-	struct v4l2_register *regs = arg;
+	struct v4l2_dbg_register *regs = arg;
 	unsigned long flags;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -286,6 +280,7 @@ static int cx18_cxc(struct cx18 *cx, unsigned int cmd, void *arg)
 		return -EINVAL;
 
 	spin_lock_irqsave(&cx18_cards_lock, flags);
+	regs->size = 4;
 	if (cmd == VIDIOC_DBG_G_REGISTER)
 		regs->val = cx18_read_enc(cx, regs->reg);
 	else
@@ -295,31 +290,25 @@ static int cx18_cxc(struct cx18 *cx, unsigned int cmd, void *arg)
 }
 
 static int cx18_g_register(struct file *file, void *fh,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct cx18 *cx = ((struct cx18_open_id *)fh)->cx;
 
-	if (v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (v4l2_chip_match_host(&reg->match))
 		return cx18_cxc(cx, VIDIOC_DBG_G_REGISTER, reg);
-	if (reg->match_type == V4L2_CHIP_MATCH_I2C_DRIVER)
-		return cx18_i2c_id(cx, reg->match_chip, VIDIOC_DBG_G_REGISTER,
-					reg);
-	return cx18_call_i2c_client(cx, reg->match_chip, VIDIOC_DBG_G_REGISTER,
-					reg);
+	cx18_call_i2c_clients(cx, VIDIOC_DBG_G_REGISTER, reg);
+	return 0;
 }
 
 static int cx18_s_register(struct file *file, void *fh,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct cx18 *cx = ((struct cx18_open_id *)fh)->cx;
 
-	if (v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (v4l2_chip_match_host(&reg->match))
 		return cx18_cxc(cx, VIDIOC_DBG_S_REGISTER, reg);
-	if (reg->match_type == V4L2_CHIP_MATCH_I2C_DRIVER)
-		return cx18_i2c_id(cx, reg->match_chip, VIDIOC_DBG_S_REGISTER,
-					reg);
-	return cx18_call_i2c_client(cx, reg->match_chip, VIDIOC_DBG_S_REGISTER,
-					reg);
+	cx18_call_i2c_clients(cx, VIDIOC_DBG_S_REGISTER, reg);
+	return 0;
 }
 #endif
 
diff --git a/drivers/media/video/cx23885/cx23885-video.c b/drivers/media/video/cx23885/cx23885-video.c
index 637c4d00884..2d81c4d0434 100644
--- a/drivers/media/video/cx23885/cx23885-video.c
+++ b/drivers/media/video/cx23885/cx23885-video.c
@@ -1326,11 +1326,11 @@ static int vidioc_s_frequency(struct file *file, void *priv,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int vidioc_g_register(struct file *file, void *fh,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct cx23885_dev *dev = ((struct cx23885_fh *)fh)->dev;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 
 	cx23885_call_i2c_clients(&dev->i2c_bus[2], VIDIOC_DBG_G_REGISTER, reg);
@@ -1339,11 +1339,11 @@ static int vidioc_g_register(struct file *file, void *fh,
 }
 
 static int vidioc_s_register(struct file *file, void *fh,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct cx23885_dev *dev = ((struct cx23885_fh *)fh)->dev;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 
 	cx23885_call_i2c_clients(&dev->i2c_bus[2], VIDIOC_DBG_S_REGISTER, reg);
diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c
index 2ad277189da..88f2fd32bfe 100644
--- a/drivers/media/video/cx25840/cx25840-core.c
+++ b/drivers/media/video/cx25840/cx25840-core.c
@@ -1120,25 +1120,24 @@ static int cx25840_init(struct v4l2_subdev *sd, u32 val)
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int cx25840_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int cx25840_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+	reg->size = 1;
 	reg->val = cx25840_read(client, reg->reg & 0x0fff);
 	return 0;
 }
 
-static int cx25840_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int cx25840_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1362,7 +1361,7 @@ static int cx25840_reset(struct v4l2_subdev *sd, u32 val)
 	return 0;
 }
 
-static int cx25840_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int cx25840_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct cx25840_state *state = to_state(sd);
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c
index b93b7ab99d8..791e69d804f 100644
--- a/drivers/media/video/cx88/cx88-video.c
+++ b/drivers/media/video/cx88/cx88-video.c
@@ -1447,25 +1447,26 @@ static int vidioc_s_frequency (struct file *file, void *priv,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int vidioc_g_register (struct file *file, void *fh,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct cx88_core *core = ((struct cx8800_fh*)fh)->dev->core;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 	/* cx2388x has a 24-bit register space */
-	reg->val = cx_read(reg->reg&0xffffff);
+	reg->val = cx_read(reg->reg & 0xffffff);
+	reg->size = 4;
 	return 0;
 }
 
 static int vidioc_s_register (struct file *file, void *fh,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct cx88_core *core = ((struct cx8800_fh*)fh)->dev->core;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
-	cx_write(reg->reg&0xffffff, reg->val);
+	cx_write(reg->reg & 0xffffff, reg->val);
 	return 0;
 }
 #endif
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c
index 9cb7c64a88f..416b691c33c 100644
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -1154,7 +1154,7 @@ static int em28xx_reg_len(int reg)
 }
 
 static int vidioc_g_chip_ident(struct file *file, void *priv,
-	       struct v4l2_chip_ident *chip)
+	       struct v4l2_dbg_chip_ident *chip)
 {
 	struct em28xx_fh      *fh  = priv;
 	struct em28xx         *dev = fh->dev;
@@ -1162,20 +1162,20 @@ static int vidioc_g_chip_ident(struct file *file, void *priv,
 	chip->ident = V4L2_IDENT_NONE;
 	chip->revision = 0;
 
-	em28xx_i2c_call_clients(dev, VIDIOC_G_CHIP_IDENT, chip);
+	em28xx_i2c_call_clients(dev, VIDIOC_DBG_G_CHIP_IDENT, chip);
 
 	return 0;
 }
 
 
 static int vidioc_g_register(struct file *file, void *priv,
-			     struct v4l2_register *reg)
+			     struct v4l2_dbg_register *reg)
 {
 	struct em28xx_fh      *fh  = priv;
 	struct em28xx         *dev = fh->dev;
 	int ret;
 
-	switch (reg->match_type) {
+	switch (reg->match.type) {
 	case V4L2_CHIP_MATCH_AC97:
 		mutex_lock(&dev->lock);
 		ret = em28xx_read_ac97(dev, reg->reg);
@@ -1184,6 +1184,7 @@ static int vidioc_g_register(struct file *file, void *priv,
 			return ret;
 
 		reg->val = ret;
+		reg->size = 1;
 		return 0;
 	case V4L2_CHIP_MATCH_I2C_DRIVER:
 		em28xx_i2c_call_clients(dev, VIDIOC_DBG_G_REGISTER, reg);
@@ -1192,12 +1193,13 @@ static int vidioc_g_register(struct file *file, void *priv,
 		/* Not supported yet */
 		return -EINVAL;
 	default:
-		if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+		if (!v4l2_chip_match_host(&reg->match))
 			return -EINVAL;
 	}
 
 	/* Match host */
-	if (em28xx_reg_len(reg->reg) == 1) {
+	reg->size = em28xx_reg_len(reg->reg);
+	if (reg->size == 1) {
 		mutex_lock(&dev->lock);
 		ret = em28xx_read_reg(dev, reg->reg);
 		mutex_unlock(&dev->lock);
@@ -1207,7 +1209,7 @@ static int vidioc_g_register(struct file *file, void *priv,
 
 		reg->val = ret;
 	} else {
-		__le64 val = 0;
+		__le16 val = 0;
 		mutex_lock(&dev->lock);
 		ret = em28xx_read_reg_req_len(dev, USB_REQ_GET_STATUS,
 						   reg->reg, (char *)&val, 2);
@@ -1215,21 +1217,21 @@ static int vidioc_g_register(struct file *file, void *priv,
 		if (ret < 0)
 			return ret;
 
-		reg->val = le64_to_cpu(val);
+		reg->val = le16_to_cpu(val);
 	}
 
 	return 0;
 }
 
 static int vidioc_s_register(struct file *file, void *priv,
-			     struct v4l2_register *reg)
+			     struct v4l2_dbg_register *reg)
 {
 	struct em28xx_fh      *fh  = priv;
 	struct em28xx         *dev = fh->dev;
-	__le64 buf;
+	__le16 buf;
 	int    rc;
 
-	switch (reg->match_type) {
+	switch (reg->match.type) {
 	case V4L2_CHIP_MATCH_AC97:
 		mutex_lock(&dev->lock);
 		rc = em28xx_write_ac97(dev, reg->reg, reg->val);
@@ -1243,12 +1245,12 @@ static int vidioc_s_register(struct file *file, void *priv,
 		/* Not supported yet */
 		return -EINVAL;
 	default:
-		if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+		if (!v4l2_chip_match_host(&reg->match))
 			return -EINVAL;
 	}
 
 	/* Match host */
-	buf = cpu_to_le64(reg->val);
+	buf = cpu_to_le16(reg->val);
 
 	mutex_lock(&dev->lock);
 	rc = em28xx_write_regs(dev, reg->reg, (char *)&buf,
diff --git a/drivers/media/video/ivtv/ivtv-driver.c b/drivers/media/video/ivtv/ivtv-driver.c
index 08b76295175..e8e5921cdc3 100644
--- a/drivers/media/video/ivtv/ivtv-driver.c
+++ b/drivers/media/video/ivtv/ivtv-driver.c
@@ -902,18 +902,19 @@ static void ivtv_load_and_init_modules(struct ivtv *itv)
 	}
 
 	if (hw & IVTV_HW_SAA711X) {
-		struct v4l2_chip_ident v = { V4L2_CHIP_MATCH_I2C_DRIVER, I2C_DRIVERID_SAA711X };
+		struct v4l2_dbg_chip_ident v;
 
 		/* determine the exact saa711x model */
 		itv->hw_flags &= ~IVTV_HW_SAA711X;
 
+		v.match.type = V4L2_CHIP_MATCH_I2C_DRIVER;
+		strlcpy(v.match.name, "saa7115", sizeof(v.match.name));
 		ivtv_call_hw(itv, IVTV_HW_SAA711X, core, g_chip_ident, &v);
 		if (v.ident == V4L2_IDENT_SAA7114) {
 			itv->hw_flags |= IVTV_HW_SAA7114;
 			/* VBI is not yet supported by the saa7114 driver. */
 			itv->v4l2_cap &= ~(V4L2_CAP_SLICED_VBI_CAPTURE|V4L2_CAP_VBI_CAPTURE);
-		}
-		else {
+		} else {
 			itv->hw_flags |= IVTV_HW_SAA7115;
 		}
 		itv->vbi.raw_decoder_line_size = 1443;
diff --git a/drivers/media/video/ivtv/ivtv-ioctl.c b/drivers/media/video/ivtv/ivtv-ioctl.c
index 1f6ca93b984..f6b3ef6e691 100644
--- a/drivers/media/video/ivtv/ivtv-ioctl.c
+++ b/drivers/media/video/ivtv/ivtv-ioctl.c
@@ -674,19 +674,19 @@ static int ivtv_s_fmt_vid_out_overlay(struct file *file, void *fh, struct v4l2_f
 	return ret;
 }
 
-static int ivtv_g_chip_ident(struct file *file, void *fh, struct v4l2_chip_ident *chip)
+static int ivtv_g_chip_ident(struct file *file, void *fh, struct v4l2_dbg_chip_ident *chip)
 {
 	struct ivtv *itv = ((struct ivtv_open_id *)fh)->itv;
 
 	chip->ident = V4L2_IDENT_NONE;
 	chip->revision = 0;
-	if (chip->match_type == V4L2_CHIP_MATCH_HOST) {
-		if (v4l2_chip_match_host(chip->match_type, chip->match_chip))
+	if (chip->match.type == V4L2_CHIP_MATCH_HOST) {
+		if (v4l2_chip_match_host(&chip->match))
 			chip->ident = itv->has_cx23415 ? V4L2_IDENT_CX23415 : V4L2_IDENT_CX23416;
 		return 0;
 	}
-	if (chip->match_type != V4L2_CHIP_MATCH_I2C_DRIVER &&
-	    chip->match_type != V4L2_CHIP_MATCH_I2C_ADDR)
+	if (chip->match.type != V4L2_CHIP_MATCH_I2C_DRIVER &&
+	    chip->match.type != V4L2_CHIP_MATCH_I2C_ADDR)
 		return -EINVAL;
 	/* TODO: is this correct? */
 	return ivtv_call_all_err(itv, core, g_chip_ident, chip);
@@ -695,7 +695,7 @@ static int ivtv_g_chip_ident(struct file *file, void *fh, struct v4l2_chip_ident
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int ivtv_itvc(struct ivtv *itv, unsigned int cmd, void *arg)
 {
-	struct v4l2_register *regs = arg;
+	struct v4l2_dbg_register *regs = arg;
 	volatile u8 __iomem *reg_start;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -710,6 +710,7 @@ static int ivtv_itvc(struct ivtv *itv, unsigned int cmd, void *arg)
 	else
 		return -EINVAL;
 
+	regs->size = 4;
 	if (cmd == VIDIOC_DBG_G_REGISTER)
 		regs->val = readl(regs->reg + reg_start);
 	else
@@ -717,11 +718,11 @@ static int ivtv_itvc(struct ivtv *itv, unsigned int cmd, void *arg)
 	return 0;
 }
 
-static int ivtv_g_register(struct file *file, void *fh, struct v4l2_register *reg)
+static int ivtv_g_register(struct file *file, void *fh, struct v4l2_dbg_register *reg)
 {
 	struct ivtv *itv = ((struct ivtv_open_id *)fh)->itv;
 
-	if (v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (v4l2_chip_match_host(&reg->match))
 		return ivtv_itvc(itv, VIDIOC_DBG_G_REGISTER, reg);
 	/* TODO: subdev errors should not be ignored, this should become a
 	   subdev helper function. */
@@ -729,11 +730,11 @@ static int ivtv_g_register(struct file *file, void *fh, struct v4l2_register *re
 	return 0;
 }
 
-static int ivtv_s_register(struct file *file, void *fh, struct v4l2_register *reg)
+static int ivtv_s_register(struct file *file, void *fh, struct v4l2_dbg_register *reg)
 {
 	struct ivtv *itv = ((struct ivtv_open_id *)fh)->itv;
 
-	if (v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (v4l2_chip_match_host(&reg->match))
 		return ivtv_itvc(itv, VIDIOC_DBG_S_REGISTER, reg);
 	/* TODO: subdev errors should not be ignored, this should become a
 	   subdev helper function. */
diff --git a/drivers/media/video/m52790.c b/drivers/media/video/m52790.c
index 07be14a9fe7..de397ef57b4 100644
--- a/drivers/media/video/m52790.c
+++ b/drivers/media/video/m52790.c
@@ -80,29 +80,28 @@ static int m52790_s_routing(struct v4l2_subdev *sd, const struct v4l2_routing *r
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int m52790_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int m52790_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct m52790_state *state = to_state(sd);
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (reg->reg != 0)
 		return -EINVAL;
+	reg->size = 1;
 	reg->val = state->input | state->output;
 	return 0;
 }
 
-static int m52790_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int m52790_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct m52790_state *state = to_state(sd);
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -115,7 +114,7 @@ static int m52790_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
 }
 #endif
 
-static int m52790_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int m52790_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/msp3400-driver.c b/drivers/media/video/msp3400-driver.c
index b8577ade405..4d7a9185211 100644
--- a/drivers/media/video/msp3400-driver.c
+++ b/drivers/media/video/msp3400-driver.c
@@ -733,7 +733,7 @@ static int msp_queryctrl(struct v4l2_subdev *sd, struct v4l2_queryctrl *qc)
 	return 0;
 }
 
-static int msp_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int msp_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct msp_state *state = to_state(sd);
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
diff --git a/drivers/media/video/mt9m001.c b/drivers/media/video/mt9m001.c
index 1a1a1245367..c1bf75ef274 100644
--- a/drivers/media/video/mt9m001.c
+++ b/drivers/media/video/mt9m001.c
@@ -343,14 +343,14 @@ static int mt9m001_try_fmt(struct soc_camera_device *icd,
 }
 
 static int mt9m001_get_chip_id(struct soc_camera_device *icd,
-			       struct v4l2_chip_ident *id)
+			       struct v4l2_dbg_chip_ident *id)
 {
 	struct mt9m001 *mt9m001 = container_of(icd, struct mt9m001, icd);
 
-	if (id->match_type != V4L2_CHIP_MATCH_I2C_ADDR)
+	if (id->match.type != V4L2_CHIP_MATCH_I2C_ADDR)
 		return -EINVAL;
 
-	if (id->match_chip != mt9m001->client->addr)
+	if (id->match.addr != mt9m001->client->addr)
 		return -ENODEV;
 
 	id->ident	= mt9m001->model;
@@ -361,16 +361,17 @@ static int mt9m001_get_chip_id(struct soc_camera_device *icd,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int mt9m001_get_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9m001 *mt9m001 = container_of(icd, struct mt9m001, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9m001->client->addr)
+	if (reg->match.addr != mt9m001->client->addr)
 		return -ENODEV;
 
+	reg->size = 2;
 	reg->val = reg_read(icd, reg->reg);
 
 	if (reg->val > 0xffff)
@@ -380,14 +381,14 @@ static int mt9m001_get_register(struct soc_camera_device *icd,
 }
 
 static int mt9m001_set_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9m001 *mt9m001 = container_of(icd, struct mt9m001, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9m001->client->addr)
+	if (reg->match.addr != mt9m001->client->addr)
 		return -ENODEV;
 
 	if (reg_write(icd, reg->reg, reg->val) < 0)
diff --git a/drivers/media/video/mt9m111.c b/drivers/media/video/mt9m111.c
index c89ea41fe25..5b8e20979cc 100644
--- a/drivers/media/video/mt9m111.c
+++ b/drivers/media/video/mt9m111.c
@@ -514,14 +514,14 @@ static int mt9m111_try_fmt(struct soc_camera_device *icd,
 }
 
 static int mt9m111_get_chip_id(struct soc_camera_device *icd,
-			       struct v4l2_chip_ident *id)
+			       struct v4l2_dbg_chip_ident *id)
 {
 	struct mt9m111 *mt9m111 = container_of(icd, struct mt9m111, icd);
 
-	if (id->match_type != V4L2_CHIP_MATCH_I2C_ADDR)
+	if (id->match.type != V4L2_CHIP_MATCH_I2C_ADDR)
 		return -EINVAL;
 
-	if (id->match_chip != mt9m111->client->addr)
+	if (id->match.addr != mt9m111->client->addr)
 		return -ENODEV;
 
 	id->ident	= mt9m111->model;
@@ -532,18 +532,19 @@ static int mt9m111_get_chip_id(struct soc_camera_device *icd,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int mt9m111_get_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	int val;
 
 	struct mt9m111 *mt9m111 = container_of(icd, struct mt9m111, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0x2ff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0x2ff)
 		return -EINVAL;
-	if (reg->match_chip != mt9m111->client->addr)
+	if (reg->match.addr != mt9m111->client->addr)
 		return -ENODEV;
 
 	val = mt9m111_reg_read(icd, reg->reg);
+	reg->size = 2;
 	reg->val = (u64)val;
 
 	if (reg->val > 0xffff)
@@ -553,14 +554,14 @@ static int mt9m111_get_register(struct soc_camera_device *icd,
 }
 
 static int mt9m111_set_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9m111 *mt9m111 = container_of(icd, struct mt9m111, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0x2ff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0x2ff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9m111->client->addr)
+	if (reg->match.addr != mt9m111->client->addr)
 		return -ENODEV;
 
 	if (mt9m111_reg_write(icd, reg->reg, reg->val) < 0)
diff --git a/drivers/media/video/mt9t031.c b/drivers/media/video/mt9t031.c
index 1a9d53966d0..349d8e36553 100644
--- a/drivers/media/video/mt9t031.c
+++ b/drivers/media/video/mt9t031.c
@@ -326,14 +326,14 @@ static int mt9t031_try_fmt(struct soc_camera_device *icd,
 }
 
 static int mt9t031_get_chip_id(struct soc_camera_device *icd,
-			       struct v4l2_chip_ident *id)
+			       struct v4l2_dbg_chip_ident *id)
 {
 	struct mt9t031 *mt9t031 = container_of(icd, struct mt9t031, icd);
 
-	if (id->match_type != V4L2_CHIP_MATCH_I2C_ADDR)
+	if (id->match.type != V4L2_CHIP_MATCH_I2C_ADDR)
 		return -EINVAL;
 
-	if (id->match_chip != mt9t031->client->addr)
+	if (id->match.addr != mt9t031->client->addr)
 		return -ENODEV;
 
 	id->ident	= mt9t031->model;
@@ -344,14 +344,14 @@ static int mt9t031_get_chip_id(struct soc_camera_device *icd,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int mt9t031_get_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9t031 *mt9t031 = container_of(icd, struct mt9t031, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9t031->client->addr)
+	if (reg->match.addr != mt9t031->client->addr)
 		return -ENODEV;
 
 	reg->val = reg_read(icd, reg->reg);
@@ -363,14 +363,14 @@ static int mt9t031_get_register(struct soc_camera_device *icd,
 }
 
 static int mt9t031_set_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9t031 *mt9t031 = container_of(icd, struct mt9t031, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9t031->client->addr)
+	if (reg->match.addr != mt9t031->client->addr)
 		return -ENODEV;
 
 	if (reg_write(icd, reg->reg, reg->val) < 0)
diff --git a/drivers/media/video/mt9v022.c b/drivers/media/video/mt9v022.c
index 14a5f9c21ff..b04c8cb1644 100644
--- a/drivers/media/video/mt9v022.c
+++ b/drivers/media/video/mt9v022.c
@@ -422,14 +422,14 @@ static int mt9v022_try_fmt(struct soc_camera_device *icd,
 }
 
 static int mt9v022_get_chip_id(struct soc_camera_device *icd,
-			       struct v4l2_chip_ident *id)
+			       struct v4l2_dbg_chip_ident *id)
 {
 	struct mt9v022 *mt9v022 = container_of(icd, struct mt9v022, icd);
 
-	if (id->match_type != V4L2_CHIP_MATCH_I2C_ADDR)
+	if (id->match.type != V4L2_CHIP_MATCH_I2C_ADDR)
 		return -EINVAL;
 
-	if (id->match_chip != mt9v022->client->addr)
+	if (id->match.addr != mt9v022->client->addr)
 		return -ENODEV;
 
 	id->ident	= mt9v022->model;
@@ -440,16 +440,17 @@ static int mt9v022_get_chip_id(struct soc_camera_device *icd,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int mt9v022_get_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9v022 *mt9v022 = container_of(icd, struct mt9v022, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9v022->client->addr)
+	if (reg->match.addr != mt9v022->client->addr)
 		return -ENODEV;
 
+	reg->size = 2;
 	reg->val = reg_read(icd, reg->reg);
 
 	if (reg->val > 0xffff)
@@ -459,14 +460,14 @@ static int mt9v022_get_register(struct soc_camera_device *icd,
 }
 
 static int mt9v022_set_register(struct soc_camera_device *icd,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct mt9v022 *mt9v022 = container_of(icd, struct mt9v022, icd);
 
-	if (reg->match_type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
+	if (reg->match.type != V4L2_CHIP_MATCH_I2C_ADDR || reg->reg > 0xff)
 		return -EINVAL;
 
-	if (reg->match_chip != mt9v022->client->addr)
+	if (reg->match.addr != mt9v022->client->addr)
 		return -ENODEV;
 
 	if (reg_write(icd, reg->reg, reg->val) < 0)
diff --git a/drivers/media/video/ov7670.c b/drivers/media/video/ov7670.c
index ea032f5f2f4..ca26b0c50cf 100644
--- a/drivers/media/video/ov7670.c
+++ b/drivers/media/video/ov7670.c
@@ -1310,7 +1310,7 @@ static int ov7670_command(struct i2c_client *client, unsigned int cmd,
 		void *arg)
 {
 	switch (cmd) {
-	case VIDIOC_G_CHIP_IDENT:
+	case VIDIOC_DBG_G_CHIP_IDENT:
 		return v4l2_chip_ident_i2c_client(client, arg, V4L2_IDENT_OV7670, 0);
 
 	case VIDIOC_INT_RESET:
diff --git a/drivers/media/video/ov772x.c b/drivers/media/video/ov772x.c
index 54b736fcc07..3c9e0ba974e 100644
--- a/drivers/media/video/ov772x.c
+++ b/drivers/media/video/ov772x.c
@@ -724,7 +724,7 @@ static unsigned long ov772x_query_bus_param(struct soc_camera_device *icd)
 }
 
 static int ov772x_get_chip_id(struct soc_camera_device *icd,
-			      struct v4l2_chip_ident   *id)
+			      struct v4l2_dbg_chip_ident   *id)
 {
 	struct ov772x_priv *priv = container_of(icd, struct ov772x_priv, icd);
 
@@ -736,11 +736,12 @@ static int ov772x_get_chip_id(struct soc_camera_device *icd,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int ov772x_get_register(struct soc_camera_device *icd,
-			       struct v4l2_register *reg)
+			       struct v4l2_dbg_register *reg)
 {
 	struct ov772x_priv *priv = container_of(icd, struct ov772x_priv, icd);
 	int                 ret;
 
+	reg->size = 1;
 	if (reg->reg > 0xff)
 		return -EINVAL;
 
@@ -754,7 +755,7 @@ static int ov772x_get_register(struct soc_camera_device *icd,
 }
 
 static int ov772x_set_register(struct soc_camera_device *icd,
-			       struct v4l2_register *reg)
+			       struct v4l2_dbg_register *reg)
 {
 	struct ov772x_priv *priv = container_of(icd, struct ov772x_priv, icd);
 
diff --git a/drivers/media/video/pvrusb2/pvrusb2-hdw.c b/drivers/media/video/pvrusb2/pvrusb2-hdw.c
index 4358079f196..8fb92ac78c7 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-hdw.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-hdw.c
@@ -4732,26 +4732,25 @@ static int pvr2_hdw_get_eeprom_addr(struct pvr2_hdw *hdw)
 
 
 int pvr2_hdw_register_access(struct pvr2_hdw *hdw,
-			     u32 match_type, u32 match_chip, u64 reg_id,
-			     int setFl,u64 *val_ptr)
+			     struct v4l2_dbg_match *match, u64 reg_id,
+			     int setFl, u64 *val_ptr)
 {
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 	struct pvr2_i2c_client *cp;
-	struct v4l2_register req;
+	struct v4l2_dbg_register req;
 	int stat = 0;
 	int okFl = 0;
 
 	if (!capable(CAP_SYS_ADMIN)) return -EPERM;
 
-	req.match_type = match_type;
-	req.match_chip = match_chip;
+	req.match = *match;
 	req.reg = reg_id;
 	if (setFl) req.val = *val_ptr;
 	mutex_lock(&hdw->i2c_list_lock); do {
 		list_for_each_entry(cp, &hdw->i2c_clients, list) {
 			if (!v4l2_chip_match_i2c_client(
 				    cp->client,
-				    req.match_type, req.match_chip)) {
+				    &req.match)) {
 				continue;
 			}
 			stat = pvr2_i2c_client_cmd(
diff --git a/drivers/media/video/pvrusb2/pvrusb2-hdw.h b/drivers/media/video/pvrusb2/pvrusb2-hdw.h
index 49482d1f2b2..1b4fec337c6 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-hdw.h
+++ b/drivers/media/video/pvrusb2/pvrusb2-hdw.h
@@ -242,8 +242,8 @@ void pvr2_hdw_v4l_store_minor_number(struct pvr2_hdw *,
    setFl   - true to set the register, false to read it
    val_ptr - storage location for source / result. */
 int pvr2_hdw_register_access(struct pvr2_hdw *,
-			     u32 match_type, u32 match_chip,u64 reg_id,
-			     int setFl,u64 *val_ptr);
+			     struct v4l2_dbg_match *match, u64 reg_id,
+			     int setFl, u64 *val_ptr);
 
 /* The following entry points are all lower level things you normally don't
    want to worry about. */
diff --git a/drivers/media/video/pvrusb2/pvrusb2-v4l2.c b/drivers/media/video/pvrusb2/pvrusb2-v4l2.c
index b9aedceb2c4..878fd52a73b 100644
--- a/drivers/media/video/pvrusb2/pvrusb2-v4l2.c
+++ b/drivers/media/video/pvrusb2/pvrusb2-v4l2.c
@@ -851,11 +851,11 @@ static long pvr2_v4l2_do_ioctl(struct file *file, unsigned int cmd, void *arg)
 	case VIDIOC_DBG_G_REGISTER:
 	{
 		u64 val;
-		struct v4l2_register *req = (struct v4l2_register *)arg;
+		struct v4l2_dbg_register *req = (struct v4l2_dbg_register *)arg;
 		if (cmd == VIDIOC_DBG_S_REGISTER) val = req->val;
 		ret = pvr2_hdw_register_access(
-			hdw,req->match_type,req->match_chip,req->reg,
-			cmd == VIDIOC_DBG_S_REGISTER,&val);
+			hdw, &req->match, req->reg,
+			cmd == VIDIOC_DBG_S_REGISTER, &val);
 		if (cmd == VIDIOC_DBG_G_REGISTER) req->val = val;
 		break;
 	}
diff --git a/drivers/media/video/saa7115.c b/drivers/media/video/saa7115.c
index 22708ecdf1b..46c796c3fec 100644
--- a/drivers/media/video/saa7115.c
+++ b/drivers/media/video/saa7115.c
@@ -1371,25 +1371,24 @@ static int saa711x_g_vbi_data(struct v4l2_subdev *sd, struct v4l2_sliced_vbi_dat
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int saa711x_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int saa711x_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	reg->val = saa711x_read(sd, reg->reg & 0xff);
+	reg->size = 1;
 	return 0;
 }
 
-static int saa711x_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int saa711x_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1398,7 +1397,7 @@ static int saa711x_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
 }
 #endif
 
-static int saa711x_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int saa711x_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct saa711x_state *state = to_state(sd);
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
diff --git a/drivers/media/video/saa7127.c b/drivers/media/video/saa7127.c
index bfc85654795..d6848f7a503 100644
--- a/drivers/media/video/saa7127.c
+++ b/drivers/media/video/saa7127.c
@@ -623,25 +623,24 @@ static int saa7127_s_vbi_data(struct v4l2_subdev *sd, const struct v4l2_sliced_v
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int saa7127_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int saa7127_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	reg->val = saa7127_read(sd, reg->reg & 0xff);
+	reg->size = 1;
 	return 0;
 }
 
-static int saa7127_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int saa7127_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -650,7 +649,7 @@ static int saa7127_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
 }
 #endif
 
-static int saa7127_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int saa7127_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct saa7127_state *state = to_state(sd);
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
diff --git a/drivers/media/video/saa7134/saa6752hs.c b/drivers/media/video/saa7134/saa6752hs.c
index 1fb6eccdade..1fee6e84a51 100644
--- a/drivers/media/video/saa7134/saa6752hs.c
+++ b/drivers/media/video/saa7134/saa6752hs.c
@@ -838,7 +838,7 @@ saa6752hs_command(struct i2c_client *client, unsigned int cmd, void *arg)
 		h->standard = *((v4l2_std_id *) arg);
 		break;
 
-	case VIDIOC_G_CHIP_IDENT:
+	case VIDIOC_DBG_G_CHIP_IDENT:
 		return v4l2_chip_ident_i2c_client(client,
 				arg, h->chip, h->revision);
 
diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c
index 3beba480137..c9d8beb87a6 100644
--- a/drivers/media/video/saa7134/saa7134-empress.c
+++ b/drivers/media/video/saa7134/saa7134-empress.c
@@ -405,7 +405,7 @@ static int empress_querymenu(struct file *file, void *priv,
 }
 
 static int empress_g_chip_ident(struct file *file, void *fh,
-	       struct v4l2_chip_ident *chip)
+	       struct v4l2_dbg_chip_ident *chip)
 {
 	struct saa7134_dev *dev = file->private_data;
 
@@ -413,12 +413,12 @@ static int empress_g_chip_ident(struct file *file, void *fh,
 	chip->revision = 0;
 	if (dev->mpeg_i2c_client == NULL)
 		return -EINVAL;
-	if (chip->match_type == V4L2_CHIP_MATCH_I2C_DRIVER &&
-	    chip->match_chip == I2C_DRIVERID_SAA6752HS)
-		return saa7134_i2c_call_saa6752(dev, VIDIOC_G_CHIP_IDENT, chip);
-	if (chip->match_type == V4L2_CHIP_MATCH_I2C_ADDR &&
-	    chip->match_chip == dev->mpeg_i2c_client->addr)
-		return saa7134_i2c_call_saa6752(dev, VIDIOC_G_CHIP_IDENT, chip);
+	if (chip->match.type == V4L2_CHIP_MATCH_I2C_DRIVER &&
+	    !strcmp(chip->match.name, "saa6752hs"))
+		return saa7134_i2c_call_saa6752(dev, VIDIOC_DBG_G_CHIP_IDENT, chip);
+	if (chip->match.type == V4L2_CHIP_MATCH_I2C_ADDR &&
+	    chip->match.addr == dev->mpeg_i2c_client->addr)
+		return saa7134_i2c_call_saa6752(dev, VIDIOC_DBG_G_CHIP_IDENT, chip);
 	return -EINVAL;
 }
 
diff --git a/drivers/media/video/saa7134/saa7134-video.c b/drivers/media/video/saa7134/saa7134-video.c
index 6b2ab57538e..a1f7e351f57 100644
--- a/drivers/media/video/saa7134/saa7134-video.c
+++ b/drivers/media/video/saa7134/saa7134-video.c
@@ -2247,24 +2247,25 @@ static int saa7134_g_parm(struct file *file, void *fh,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int vidioc_g_register (struct file *file, void *priv,
-			      struct v4l2_register *reg)
+			      struct v4l2_dbg_register *reg)
 {
 	struct saa7134_fh *fh = priv;
 	struct saa7134_dev *dev = fh->dev;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 	reg->val = saa_readb(reg->reg);
+	reg->size = 1;
 	return 0;
 }
 
 static int vidioc_s_register (struct file *file, void *priv,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct saa7134_fh *fh = priv;
 	struct saa7134_dev *dev = fh->dev;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 	saa_writeb(reg->reg&0xffffff, reg->val);
 	return 0;
diff --git a/drivers/media/video/saa717x.c b/drivers/media/video/saa717x.c
index 9befca65905..454ad1dd750 100644
--- a/drivers/media/video/saa717x.c
+++ b/drivers/media/video/saa717x.c
@@ -1171,25 +1171,26 @@ static int saa717x_queryctrl(struct v4l2_subdev *sd, struct v4l2_queryctrl *qc)
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int saa717x_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int saa717x_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client, reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	reg->val = saa717x_read(sd, reg->reg);
+	reg->size = 1;
 	return 0;
 }
 
-static int saa717x_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int saa717x_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 	u16 addr = reg->reg & 0xffff;
 	u8 val = reg->val & 0xff;
 
-	if (!v4l2_chip_match_i2c_client(client, reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
diff --git a/drivers/media/video/soc_camera.c b/drivers/media/video/soc_camera.c
index 9986e02bcf1..fcb05f06de8 100644
--- a/drivers/media/video/soc_camera.c
+++ b/drivers/media/video/soc_camera.c
@@ -699,7 +699,7 @@ static int soc_camera_s_crop(struct file *file, void *fh,
 }
 
 static int soc_camera_g_chip_ident(struct file *file, void *fh,
-				   struct v4l2_chip_ident *id)
+				   struct v4l2_dbg_chip_ident *id)
 {
 	struct soc_camera_file *icf = file->private_data;
 	struct soc_camera_device *icd = icf->icd;
@@ -712,7 +712,7 @@ static int soc_camera_g_chip_ident(struct file *file, void *fh,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int soc_camera_g_register(struct file *file, void *fh,
-				 struct v4l2_register *reg)
+				 struct v4l2_dbg_register *reg)
 {
 	struct soc_camera_file *icf = file->private_data;
 	struct soc_camera_device *icd = icf->icd;
@@ -724,7 +724,7 @@ static int soc_camera_g_register(struct file *file, void *fh,
 }
 
 static int soc_camera_s_register(struct file *file, void *fh,
-				 struct v4l2_register *reg)
+				 struct v4l2_dbg_register *reg)
 {
 	struct soc_camera_file *icf = file->private_data;
 	struct soc_camera_device *icd = icf->icd;
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index d0c794da735..5aeccb301ce 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -1762,7 +1762,7 @@ static int tvaudio_s_frequency(struct v4l2_subdev *sd, struct v4l2_frequency *fr
 	return 0;
 }
 
-static int tvaudio_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int tvaudio_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/tvp5150.c b/drivers/media/video/tvp5150.c
index a388a9f0cb1..2cd64ef27b9 100644
--- a/drivers/media/video/tvp5150.c
+++ b/drivers/media/video/tvp5150.c
@@ -963,7 +963,7 @@ static int tvp5150_g_fmt(struct v4l2_subdev *sd, struct v4l2_format *fmt)
 
 
 static int tvp5150_g_chip_ident(struct v4l2_subdev *sd,
-				struct v4l2_chip_ident *chip)
+				struct v4l2_dbg_chip_ident *chip)
 {
 	int rev;
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
@@ -977,25 +977,24 @@ static int tvp5150_g_chip_ident(struct v4l2_subdev *sd,
 
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int tvp5150_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int tvp5150_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	reg->val = tvp5150_read(sd, reg->reg & 0xff);
+	reg->size = 1;
 	return 0;
 }
 
-static int tvp5150_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int tvp5150_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
diff --git a/drivers/media/video/tw9910.c b/drivers/media/video/tw9910.c
index d5cdc4be1a3..52c0357faa5 100644
--- a/drivers/media/video/tw9910.c
+++ b/drivers/media/video/tw9910.c
@@ -575,7 +575,7 @@ static unsigned long tw9910_query_bus_param(struct soc_camera_device *icd)
 }
 
 static int tw9910_get_chip_id(struct soc_camera_device *icd,
-			      struct v4l2_chip_ident *id)
+			      struct v4l2_dbg_chip_ident *id)
 {
 	id->ident = V4L2_IDENT_TW9910;
 	id->revision = 0;
@@ -606,7 +606,7 @@ static int tw9910_enum_input(struct soc_camera_device *icd,
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int tw9910_get_register(struct soc_camera_device *icd,
-			       struct v4l2_register *reg)
+			       struct v4l2_dbg_register *reg)
 {
 	struct tw9910_priv *priv = container_of(icd, struct tw9910_priv, icd);
 	int ret;
@@ -627,7 +627,7 @@ static int tw9910_get_register(struct soc_camera_device *icd,
 }
 
 static int tw9910_set_register(struct soc_camera_device *icd,
-			       struct v4l2_register *reg)
+			       struct v4l2_dbg_register *reg)
 {
 	struct tw9910_priv *priv = container_of(icd, struct tw9910_priv, icd);
 
diff --git a/drivers/media/video/upd64031a.c b/drivers/media/video/upd64031a.c
index 7a609a3a6db..4f16effb530 100644
--- a/drivers/media/video/upd64031a.c
+++ b/drivers/media/video/upd64031a.c
@@ -147,7 +147,7 @@ static int upd64031a_s_routing(struct v4l2_subdev *sd, const struct v4l2_routing
 	return upd64031a_s_frequency(sd, NULL);
 }
 
-static int upd64031a_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int upd64031a_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
@@ -162,25 +162,24 @@ static int upd64031a_log_status(struct v4l2_subdev *sd)
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int upd64031a_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int upd64031a_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	reg->val = upd64031a_read(sd, reg->reg & 0xff);
+	reg->size = 1;
 	return 0;
 }
 
-static int upd64031a_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int upd64031a_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
diff --git a/drivers/media/video/upd64083.c b/drivers/media/video/upd64083.c
index 58412cb9c01..4b712f69d1b 100644
--- a/drivers/media/video/upd64083.c
+++ b/drivers/media/video/upd64083.c
@@ -120,25 +120,24 @@ static int upd64083_s_routing(struct v4l2_subdev *sd, const struct v4l2_routing
 }
 
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-static int upd64083_g_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int upd64083_g_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	reg->val = upd64083_read(sd, reg->reg & 0xff);
+	reg->size = 1;
 	return 0;
 }
 
-static int upd64083_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg)
+static int upd64083_s_register(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
-	if (!v4l2_chip_match_i2c_client(client,
-				reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_i2c_client(client, &reg->match))
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -147,7 +146,7 @@ static int upd64083_s_register(struct v4l2_subdev *sd, struct v4l2_register *reg
 }
 #endif
 
-static int upd64083_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int upd64083_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c
index 7c61c6d5ced..2be5e47ed08 100644
--- a/drivers/media/video/usbvision/usbvision-video.c
+++ b/drivers/media/video/usbvision/usbvision-video.c
@@ -477,12 +477,12 @@ static int usbvision_v4l2_close(struct file *file)
  */
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 static int vidioc_g_register (struct file *file, void *priv,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct usb_usbvision *usbvision = video_drvdata(file);
 	int errCode;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 	/* NT100x has a 8-bit register space */
 	errCode = usbvision_read_reg(usbvision, reg->reg&0xff);
@@ -492,16 +492,17 @@ static int vidioc_g_register (struct file *file, void *priv,
 		return errCode;
 	}
 	reg->val = errCode;
+	reg->size = 1;
 	return 0;
 }
 
 static int vidioc_s_register (struct file *file, void *priv,
-				struct v4l2_register *reg)
+				struct v4l2_dbg_register *reg)
 {
 	struct usb_usbvision *usbvision = video_drvdata(file);
 	int errCode;
 
-	if (!v4l2_chip_match_host(reg->match_type, reg->match_chip))
+	if (!v4l2_chip_match_host(&reg->match))
 		return -EINVAL;
 	/* NT100x has a 8-bit register space */
 	errCode = usbvision_write_reg(usbvision, reg->reg&0xff, reg->val);
diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c
index c676b0b0f70..b8f2be8d5c0 100644
--- a/drivers/media/video/v4l2-common.c
+++ b/drivers/media/video/v4l2-common.c
@@ -797,11 +797,11 @@ u32 v4l2_ctrl_next(const u32 * const * ctrl_classes, u32 id)
 }
 EXPORT_SYMBOL(v4l2_ctrl_next);
 
-int v4l2_chip_match_host(u32 match_type, u32 match_chip)
+int v4l2_chip_match_host(const struct v4l2_dbg_match *match)
 {
-	switch (match_type) {
+	switch (match->type) {
 	case V4L2_CHIP_MATCH_HOST:
-		return match_chip == 0;
+		return match->addr == 0;
 	default:
 		return 0;
 	}
@@ -809,23 +809,34 @@ int v4l2_chip_match_host(u32 match_type, u32 match_chip)
 EXPORT_SYMBOL(v4l2_chip_match_host);
 
 #if defined(CONFIG_I2C) || (defined(CONFIG_I2C_MODULE) && defined(MODULE))
-int v4l2_chip_match_i2c_client(struct i2c_client *c, u32 match_type, u32 match_chip)
+int v4l2_chip_match_i2c_client(struct i2c_client *c, const struct v4l2_dbg_match *match)
 {
-	switch (match_type) {
+	int len;
+
+	if (c == NULL || match == NULL)
+		return 0;
+
+	switch (match->type) {
 	case V4L2_CHIP_MATCH_I2C_DRIVER:
-		return (c != NULL && c->driver != NULL && c->driver->id == match_chip);
+		if (c->driver == NULL || c->driver->driver.name == NULL)
+			return 0;
+		len = strlen(c->driver->driver.name);
+		/* legacy drivers have a ' suffix, don't try to match that */
+		if (len && c->driver->driver.name[len - 1] == '\'')
+			len--;
+		return len && !strncmp(c->driver->driver.name, match->name, len);
 	case V4L2_CHIP_MATCH_I2C_ADDR:
-		return (c != NULL && c->addr == match_chip);
+		return c->addr == match->addr;
 	default:
 		return 0;
 	}
 }
 EXPORT_SYMBOL(v4l2_chip_match_i2c_client);
 
-int v4l2_chip_ident_i2c_client(struct i2c_client *c, struct v4l2_chip_ident *chip,
+int v4l2_chip_ident_i2c_client(struct i2c_client *c, struct v4l2_dbg_chip_ident *chip,
 		u32 ident, u32 revision)
 {
-	if (!v4l2_chip_match_i2c_client(c, chip->match_type, chip->match_chip))
+	if (!v4l2_chip_match_i2c_client(c, &chip->match))
 		return 0;
 	if (chip->ident == V4L2_IDENT_NONE) {
 		chip->ident = ident;
diff --git a/drivers/media/video/v4l2-compat-ioctl32.c b/drivers/media/video/v4l2-compat-ioctl32.c
index ec81b9737bd..110376be5d2 100644
--- a/drivers/media/video/v4l2-compat-ioctl32.c
+++ b/drivers/media/video/v4l2-compat-ioctl32.c
@@ -1046,7 +1046,8 @@ long v4l2_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg)
 	case VIDIOC_TRY_ENCODER_CMD:
 	case VIDIOC_DBG_S_REGISTER:
 	case VIDIOC_DBG_G_REGISTER:
-	case VIDIOC_G_CHIP_IDENT:
+	case VIDIOC_DBG_G_CHIP_IDENT:
+	case VIDIOC_G_CHIP_IDENT_OLD:
 	case VIDIOC_S_HW_FREQ_SEEK:
 		ret = do_video_ioctl(file, cmd, arg);
 		break;
diff --git a/drivers/media/video/v4l2-ioctl.c b/drivers/media/video/v4l2-ioctl.c
index 8f629ef5b9e..52d687b165e 100644
--- a/drivers/media/video/v4l2-ioctl.c
+++ b/drivers/media/video/v4l2-ioctl.c
@@ -266,7 +266,7 @@ static const char *v4l2_ioctls[] = {
 	[_IOC_NR(VIDIOC_DBG_S_REGISTER)]   = "VIDIOC_DBG_S_REGISTER",
 	[_IOC_NR(VIDIOC_DBG_G_REGISTER)]   = "VIDIOC_DBG_G_REGISTER",
 
-	[_IOC_NR(VIDIOC_G_CHIP_IDENT)]     = "VIDIOC_G_CHIP_IDENT",
+	[_IOC_NR(VIDIOC_DBG_G_CHIP_IDENT)] = "VIDIOC_DBG_G_CHIP_IDENT",
 	[_IOC_NR(VIDIOC_S_HW_FREQ_SEEK)]   = "VIDIOC_S_HW_FREQ_SEEK",
 #endif
 };
@@ -1720,7 +1720,7 @@ static long __video_do_ioctl(struct file *file,
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 	case VIDIOC_DBG_G_REGISTER:
 	{
-		struct v4l2_register *p = arg;
+		struct v4l2_dbg_register *p = arg;
 
 		if (!capable(CAP_SYS_ADMIN))
 			ret = -EPERM;
@@ -1730,7 +1730,7 @@ static long __video_do_ioctl(struct file *file,
 	}
 	case VIDIOC_DBG_S_REGISTER:
 	{
-		struct v4l2_register *p = arg;
+		struct v4l2_dbg_register *p = arg;
 
 		if (!capable(CAP_SYS_ADMIN))
 			ret = -EPERM;
@@ -1739,9 +1739,9 @@ static long __video_do_ioctl(struct file *file,
 		break;
 	}
 #endif
-	case VIDIOC_G_CHIP_IDENT:
+	case VIDIOC_DBG_G_CHIP_IDENT:
 	{
-		struct v4l2_chip_ident *p = arg;
+		struct v4l2_dbg_chip_ident *p = arg;
 
 		if (!ops->vidioc_g_chip_ident)
 			break;
@@ -1750,6 +1750,11 @@ static long __video_do_ioctl(struct file *file,
 			dbgarg(cmd, "chip_ident=%u, revision=0x%x\n", p->ident, p->revision);
 		break;
 	}
+	case VIDIOC_G_CHIP_IDENT_OLD:
+		printk(KERN_ERR "VIDIOC_G_CHIP_IDENT has been deprecated and will disappear in 2.6.30.\n");
+		printk(KERN_ERR "It is a debugging ioctl and must not be used in applications!\n");
+		return -EINVAL;
+
 	case VIDIOC_S_HW_FREQ_SEEK:
 	{
 		struct v4l2_hw_freq_seek *p = arg;
diff --git a/drivers/media/video/v4l2-subdev.c b/drivers/media/video/v4l2-subdev.c
index e3612f29d0d..fbe9cc0d433 100644
--- a/drivers/media/video/v4l2-subdev.c
+++ b/drivers/media/video/v4l2-subdev.c
@@ -37,7 +37,7 @@ int v4l2_subdev_command(struct v4l2_subdev *sd, unsigned cmd, void *arg)
 		return v4l2_subdev_call(sd, core, queryctrl, arg);
 	case VIDIOC_LOG_STATUS:
 		return v4l2_subdev_call(sd, core, log_status);
-	case VIDIOC_G_CHIP_IDENT:
+	case VIDIOC_DBG_G_CHIP_IDENT:
 		return v4l2_subdev_call(sd, core, g_chip_ident, arg);
 	case VIDIOC_INT_S_STANDBY:
 		return v4l2_subdev_call(sd, core, s_standby, arg ? (*(u32 *)arg) : 0);
diff --git a/drivers/media/video/vp27smpx.c b/drivers/media/video/vp27smpx.c
index f72b859486a..5d73f66d9f5 100644
--- a/drivers/media/video/vp27smpx.c
+++ b/drivers/media/video/vp27smpx.c
@@ -113,7 +113,7 @@ static int vp27smpx_g_tuner(struct v4l2_subdev *sd, struct v4l2_tuner *vt)
 	return 0;
 }
 
-static int vp27smpx_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int vp27smpx_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/wm8739.c b/drivers/media/video/wm8739.c
index 12a31e7a5f6..f2864d5cd18 100644
--- a/drivers/media/video/wm8739.c
+++ b/drivers/media/video/wm8739.c
@@ -233,7 +233,7 @@ static int wm8739_queryctrl(struct v4l2_subdev *sd, struct v4l2_queryctrl *qc)
 	return -EINVAL;
 }
 
-static int wm8739_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int wm8739_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/drivers/media/video/wm8775.c b/drivers/media/video/wm8775.c
index d0220b0ec0b..53fcd42843e 100644
--- a/drivers/media/video/wm8775.c
+++ b/drivers/media/video/wm8775.c
@@ -130,7 +130,7 @@ static int wm8775_s_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl)
 	return 0;
 }
 
-static int wm8775_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip)
+static int wm8775_g_chip_ident(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip)
 {
 	struct i2c_client *client = v4l2_get_subdevdata(sd);
 
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 1f126e30766..5571dbe1c0a 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -1370,25 +1370,41 @@ struct v4l2_streamparm {
 /*
  *	A D V A N C E D   D E B U G G I N G
  *
- *	NOTE: EXPERIMENTAL API
+ *	NOTE: EXPERIMENTAL API, NEVER RELY ON THIS IN APPLICATIONS!
+ *	FOR DEBUGGING, TESTING AND INTERNAL USE ONLY!
  */
 
 /* VIDIOC_DBG_G_REGISTER and VIDIOC_DBG_S_REGISTER */
 
 #define V4L2_CHIP_MATCH_HOST       0  /* Match against chip ID on host (0 for the host) */
-#define V4L2_CHIP_MATCH_I2C_DRIVER 1  /* Match against I2C driver ID */
+#define V4L2_CHIP_MATCH_I2C_DRIVER 1  /* Match against I2C driver name */
 #define V4L2_CHIP_MATCH_I2C_ADDR   2  /* Match against I2C 7-bit address */
 #define V4L2_CHIP_MATCH_AC97       3  /* Match against anciliary AC97 chip */
 
-struct v4l2_register {
-	__u32 match_type; /* Match type */
-	__u32 match_chip; /* Match this chip, meaning determined by match_type */
+struct v4l2_dbg_match {
+	__u32 type; /* Match type */
+	union {     /* Match this chip, meaning determined by type */
+		__u32 addr;
+		char name[32];
+	};
+} __attribute__ ((packed));
+
+struct v4l2_dbg_register {
+	struct v4l2_dbg_match match;
+	__u32 size;	/* register size in bytes */
 	__u64 reg;
 	__u64 val;
-};
+} __attribute__ ((packed));
+
+/* VIDIOC_DBG_G_CHIP_IDENT */
+struct v4l2_dbg_chip_ident {
+	struct v4l2_dbg_match match;
+	__u32 ident;       /* chip identifier as specified in <media/v4l2-chip-ident.h> */
+	__u32 revision;    /* chip revision, chip specific */
+} __attribute__ ((packed));
 
-/* VIDIOC_G_CHIP_IDENT */
-struct v4l2_chip_ident {
+/* VIDIOC_G_CHIP_IDENT_OLD: Deprecated, do not use */
+struct v4l2_chip_ident_old {
 	__u32 match_type;  /* Match type */
 	__u32 match_chip;  /* Match this chip, meaning determined by match_type */
 	__u32 ident;       /* chip identifier as specified in <media/v4l2-chip-ident.h> */
@@ -1460,13 +1476,22 @@ struct v4l2_chip_ident {
 #define VIDIOC_G_ENC_INDEX       _IOR('V', 76, struct v4l2_enc_idx)
 #define VIDIOC_ENCODER_CMD      _IOWR('V', 77, struct v4l2_encoder_cmd)
 #define VIDIOC_TRY_ENCODER_CMD  _IOWR('V', 78, struct v4l2_encoder_cmd)
+#endif
 
-/* Experimental, only implemented if CONFIG_VIDEO_ADV_DEBUG is defined */
-#define	VIDIOC_DBG_S_REGISTER 	 _IOW('V', 79, struct v4l2_register)
-#define	VIDIOC_DBG_G_REGISTER 	_IOWR('V', 80, struct v4l2_register)
-
-#define VIDIOC_G_CHIP_IDENT     _IOWR('V', 81, struct v4l2_chip_ident)
+#if 1
+/* Experimental, meant for debugging, testing and internal use.
+   Only implemented if CONFIG_VIDEO_ADV_DEBUG is defined.
+   You must be root to use these ioctls. Never use these in applications! */
+#define	VIDIOC_DBG_S_REGISTER 	 _IOW('V', 79, struct v4l2_dbg_register)
+#define	VIDIOC_DBG_G_REGISTER 	_IOWR('V', 80, struct v4l2_dbg_register)
+
+/* Experimental, meant for debugging, testing and internal use.
+   Never use this ioctl in applications! */
+#define VIDIOC_DBG_G_CHIP_IDENT _IOWR('V', 81, struct v4l2_dbg_chip_ident)
+/* This is deprecated and will go away in 2.6.30 */
+#define VIDIOC_G_CHIP_IDENT_OLD _IOWR('V', 81, struct v4l2_chip_ident_old)
 #endif
+
 #define VIDIOC_S_HW_FREQ_SEEK	 _IOW('V', 82, struct v4l2_hw_freq_seek)
 /* Reminder: when adding new ioctls please add support for them to
    drivers/media/video/v4l2-compat-ioctl32.c as well! */
diff --git a/include/media/soc_camera.h b/include/media/soc_camera.h
index 425b6a98c95..7440d925066 100644
--- a/include/media/soc_camera.h
+++ b/include/media/soc_camera.h
@@ -164,12 +164,12 @@ struct soc_camera_ops {
 	unsigned long (*query_bus_param)(struct soc_camera_device *);
 	int (*set_bus_param)(struct soc_camera_device *, unsigned long);
 	int (*get_chip_id)(struct soc_camera_device *,
-			   struct v4l2_chip_ident *);
+			   struct v4l2_dbg_chip_ident *);
 	int (*set_std)(struct soc_camera_device *, v4l2_std_id *);
 	int (*enum_input)(struct soc_camera_device *, struct v4l2_input *);
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-	int (*get_register)(struct soc_camera_device *, struct v4l2_register *);
-	int (*set_register)(struct soc_camera_device *, struct v4l2_register *);
+	int (*get_register)(struct soc_camera_device *, struct v4l2_dbg_register *);
+	int (*set_register)(struct soc_camera_device *, struct v4l2_dbg_register *);
 #endif
 	int (*get_control)(struct soc_camera_device *, struct v4l2_control *);
 	int (*set_control)(struct soc_camera_device *, struct v4l2_control *);
diff --git a/include/media/v4l2-chip-ident.h b/include/media/v4l2-chip-ident.h
index 43dbb659f1f..9aaf652b20e 100644
--- a/include/media/v4l2-chip-ident.h
+++ b/include/media/v4l2-chip-ident.h
@@ -2,7 +2,7 @@
     v4l2 chip identifiers header
 
     This header provides a list of chip identifiers that can be returned
-    through the VIDIOC_G_CHIP_IDENT ioctl.
+    through the VIDIOC_DBG_G_CHIP_IDENT ioctl.
 
     Copyright (C) 2007 Hans Verkuil <hverkuil@xs4all.nl>
 
@@ -24,7 +24,7 @@
 #ifndef V4L2_CHIP_IDENT_H_
 #define V4L2_CHIP_IDENT_H_
 
-/* VIDIOC_G_CHIP_IDENT: identifies the actual chip installed on the board */
+/* VIDIOC_DBG_G_CHIP_IDENT: identifies the actual chip installed on the board */
 enum {
 	/* general idents: reserved range 0-49 */
 	V4L2_IDENT_NONE      = 0,       /* No chip matched */
diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
index f99c866d8c3..95e74f1874e 100644
--- a/include/media/v4l2-common.h
+++ b/include/media/v4l2-common.h
@@ -114,10 +114,10 @@ u32 v4l2_ctrl_next(const u32 * const *ctrl_classes, u32 id);
 /* Register/chip ident helper function */
 
 struct i2c_client; /* forward reference */
-int v4l2_chip_match_i2c_client(struct i2c_client *c, u32 id_type, u32 chip_id);
-int v4l2_chip_ident_i2c_client(struct i2c_client *c, struct v4l2_chip_ident *chip,
+int v4l2_chip_match_i2c_client(struct i2c_client *c, const struct v4l2_dbg_match *match);
+int v4l2_chip_ident_i2c_client(struct i2c_client *c, struct v4l2_dbg_chip_ident *chip,
 		u32 ident, u32 revision);
-int v4l2_chip_match_host(u32 id_type, u32 chip_id);
+int v4l2_chip_match_host(const struct v4l2_dbg_match *match);
 
 /* ------------------------------------------------------------------------- */
 
diff --git a/include/media/v4l2-int-device.h b/include/media/v4l2-int-device.h
index ecda3c72583..fbf58556157 100644
--- a/include/media/v4l2-int-device.h
+++ b/include/media/v4l2-int-device.h
@@ -219,7 +219,7 @@ enum v4l2_int_ioctl_num {
 	vidioc_int_reset_num,
 	/* VIDIOC_INT_INIT */
 	vidioc_int_init_num,
-	/* VIDIOC_INT_G_CHIP_IDENT */
+	/* VIDIOC_DBG_G_CHIP_IDENT */
 	vidioc_int_g_chip_ident_num,
 
 	/*
diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h
index bf0e723a99c..b01c044868d 100644
--- a/include/media/v4l2-ioctl.h
+++ b/include/media/v4l2-ioctl.h
@@ -225,12 +225,12 @@ struct v4l2_ioctl_ops {
 	/* Debugging ioctls */
 #ifdef CONFIG_VIDEO_ADV_DEBUG
 	int (*vidioc_g_register)       (struct file *file, void *fh,
-					struct v4l2_register *reg);
+					struct v4l2_dbg_register *reg);
 	int (*vidioc_s_register)       (struct file *file, void *fh,
-					struct v4l2_register *reg);
+					struct v4l2_dbg_register *reg);
 #endif
 	int (*vidioc_g_chip_ident)     (struct file *file, void *fh,
-					struct v4l2_chip_ident *chip);
+					struct v4l2_dbg_chip_ident *chip);
 
 	int (*vidioc_enum_framesizes)   (struct file *file, void *fh,
 					 struct v4l2_frmsizeenum *fsize);
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 2517344313b..37b09e56e94 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -69,7 +69,7 @@ struct tuner_setup;
    not yet implemented) since ops provide proper type-checking.
  */
 struct v4l2_subdev_core_ops {
-	int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_chip_ident *chip);
+	int (*g_chip_ident)(struct v4l2_subdev *sd, struct v4l2_dbg_chip_ident *chip);
 	int (*log_status)(struct v4l2_subdev *sd);
 	int (*init)(struct v4l2_subdev *sd, u32 val);
 	int (*s_standby)(struct v4l2_subdev *sd, u32 standby);
@@ -81,8 +81,8 @@ struct v4l2_subdev_core_ops {
 	int (*querymenu)(struct v4l2_subdev *sd, struct v4l2_querymenu *qm);
 	long (*ioctl)(struct v4l2_subdev *sd, unsigned int cmd, void *arg);
 #ifdef CONFIG_VIDEO_ADV_DEBUG
-	int (*g_register)(struct v4l2_subdev *sd, struct v4l2_register *reg);
-	int (*s_register)(struct v4l2_subdev *sd, struct v4l2_register *reg);
+	int (*g_register)(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg);
+	int (*s_register)(struct v4l2_subdev *sd, struct v4l2_dbg_register *reg);
 #endif
 };
 
-- 
cgit v1.2.3-70-g09d2


From cb889a2f3515b140bef193cf6ffcdb099349b8aa Mon Sep 17 00:00:00 2001
From: Klaus Schmidinger <Klaus.Schmidinger@cadsoft.de>
Date: Wed, 31 Dec 2008 14:11:23 -0300
Subject: V4L/DVB (10164): Add missing S2 caps flag to S2API

The attached patch adds a capability flag that allows an application
to determine whether a particular device can handle "second generation
modulation" transponders. This is necessary in order for applications
to be able to decide which device to use for a given channel in
a multi device environment, where DVB-S and DVB-S2 devices are mixed.

It is assumed that a device capable of handling "second generation
modulation" can implicitly handle "first generation modulation".
The flag is not named anything with DVBS2 in order to allow its
use with future DVBT2 devices as well (should they ever come).

Signed-off by: Klaus Schmidinger <Klaus.Schmidinger@cadsoft.de>

Acked-by: Steven Toth <stoth@linuxtv.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/dvb/frontend.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index 79a8ed8e6a7..926d28d526e 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -63,6 +63,7 @@ typedef enum fe_caps {
 	FE_CAN_8VSB			= 0x200000,
 	FE_CAN_16VSB			= 0x400000,
 	FE_HAS_EXTENDED_CAPS		= 0x800000,   // We need more bitspace for newer APIs, indicate this.
+	FE_CAN_2G_MODULATION		= 0x10000000, // frontend supports "2nd generation modulation" (DVB-S2)
 	FE_NEEDS_BENDING		= 0x20000000, // not supported anymore, don't use (frontend requires frequency bending)
 	FE_CAN_RECOVER			= 0x40000000, // frontend can recover from a cable unplug automatically
 	FE_CAN_MUTE_TS			= 0x80000000  // frontend can stop spurious TS data output
-- 
cgit v1.2.3-70-g09d2


From e4cda3e0728156c6be1d03e72ef20ea811da4ad5 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Wed, 31 Dec 2008 14:26:57 -0300
Subject: V4L/DVB (10166): dvb frontend: stop using non-C99 compliant comments

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 include/linux/dvb/frontend.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dvb/frontend.h b/include/linux/dvb/frontend.h
index 926d28d526e..55026b1a40b 100644
--- a/include/linux/dvb/frontend.h
+++ b/include/linux/dvb/frontend.h
@@ -62,11 +62,11 @@ typedef enum fe_caps {
 	FE_CAN_HIERARCHY_AUTO		= 0x100000,
 	FE_CAN_8VSB			= 0x200000,
 	FE_CAN_16VSB			= 0x400000,
-	FE_HAS_EXTENDED_CAPS		= 0x800000,   // We need more bitspace for newer APIs, indicate this.
-	FE_CAN_2G_MODULATION		= 0x10000000, // frontend supports "2nd generation modulation" (DVB-S2)
-	FE_NEEDS_BENDING		= 0x20000000, // not supported anymore, don't use (frontend requires frequency bending)
-	FE_CAN_RECOVER			= 0x40000000, // frontend can recover from a cable unplug automatically
-	FE_CAN_MUTE_TS			= 0x80000000  // frontend can stop spurious TS data output
+	FE_HAS_EXTENDED_CAPS		= 0x800000,   /* We need more bitspace for newer APIs, indicate this. */
+	FE_CAN_2G_MODULATION		= 0x10000000, /* frontend supports "2nd generation modulation" (DVB-S2) */
+	FE_NEEDS_BENDING		= 0x20000000, /* not supported anymore, don't use (frontend requires frequency bending) */
+	FE_CAN_RECOVER			= 0x40000000, /* frontend can recover from a cable unplug automatically */
+	FE_CAN_MUTE_TS			= 0x80000000  /* frontend can stop spurious TS data output */
 } fe_caps_t;
 
 
@@ -122,15 +122,15 @@ typedef enum fe_sec_mini_cmd {
 
 
 typedef enum fe_status {
-	FE_HAS_SIGNAL	= 0x01,   /*  found something above the noise level */
-	FE_HAS_CARRIER	= 0x02,   /*  found a DVB signal  */
-	FE_HAS_VITERBI	= 0x04,   /*  FEC is stable  */
-	FE_HAS_SYNC	= 0x08,   /*  found sync bytes  */
-	FE_HAS_LOCK	= 0x10,   /*  everything's working... */
-	FE_TIMEDOUT	= 0x20,   /*  no lock within the last ~2 seconds */
-	FE_REINIT	= 0x40    /*  frontend was reinitialized,  */
-} fe_status_t;			  /*  application is recommended to reset */
-				  /*  DiSEqC, tone and parameters */
+	FE_HAS_SIGNAL	= 0x01,   /* found something above the noise level */
+	FE_HAS_CARRIER	= 0x02,   /* found a DVB signal  */
+	FE_HAS_VITERBI	= 0x04,   /* FEC is stable  */
+	FE_HAS_SYNC	= 0x08,   /* found sync bytes  */
+	FE_HAS_LOCK	= 0x10,   /* everything's working... */
+	FE_TIMEDOUT	= 0x20,   /* no lock within the last ~2 seconds */
+	FE_REINIT	= 0x40    /* frontend was reinitialized,  */
+} fe_status_t;			  /* application is recommended to reset */
+				  /* DiSEqC, tone and parameters */
 
 typedef enum fe_spectral_inversion {
 	INVERSION_OFF,
-- 
cgit v1.2.3-70-g09d2


From 6680598b44ed3c0052d155522eb21fc5a00de5f3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 2 Jan 2009 18:53:14 +0100
Subject: Disallow gcc versions 3.{0,1}

GCC 3.0 and 3.1 are too old to build a working kernel.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
[ This check got dropped as obsolete when I simplified the gcc header
  inclusion mess in f153b82121b0366fe0e5f9553545cce237335175, but Willy
  Tarreau reports actually having those old versions still..  -Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc3.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index 2befe6513ce..8005effc04f 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -2,6 +2,10 @@
 #error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead."
 #endif
 
+#if __GNUC_MINOR__ < 2
+# error Sorry, your compiler is too old - please upgrade it.
+#endif
+
 #if __GNUC_MINOR__ >= 3
 # define __used			__attribute__((__used__))
 #else
-- 
cgit v1.2.3-70-g09d2


From 015ab17dc2e9de805c26e74f498b12ee5e8de07e Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 14:04:20 +0000
Subject: intel-iommu: remove some unused struct intel_iommu fields

The seg, saved_msg and sysdev fields appear to be unused since
before the code was first merged.

linux/msi.h is not needed in linux/intel-iommu.h anymore since
there is no longer a reference to struct msi_msg. The MSI code
in drivers/pci/intel-iommu.c still has linux/msi.h included
via linux/dmar.h.

linux/sysdev.h isn't needed because there is no reference to
struct sys_device.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c   | 1 -
 include/linux/intel-iommu.h | 5 -----
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 5c8baa43ac9..8e5a445ba93 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -27,7 +27,6 @@
 #include <linux/slab.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
-#include <linux/sysdev.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
 #include <linux/dmar.h>
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 3d017cfd245..1bff7bf1bc2 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -23,8 +23,6 @@
 #define _INTEL_IOMMU_H_
 
 #include <linux/types.h>
-#include <linux/msi.h>
-#include <linux/sysdev.h>
 #include <linux/iova.h>
 #include <linux/io.h>
 #include <linux/dma_remapping.h>
@@ -289,7 +287,6 @@ struct intel_iommu {
 	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
 	u64		cap;
 	u64		ecap;
-	int		seg;
 	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
 	spinlock_t	register_lock; /* protect register handling */
 	int		seq_id;	/* sequence id of the iommu */
@@ -302,8 +299,6 @@ struct intel_iommu {
 
 	unsigned int irq;
 	unsigned char name[7];    /* Device Name */
-	struct msi_msg saved_msg;
-	struct sys_device sysdev;
 	struct iommu_flush flush;
 #endif
 	struct q_inval  *qi;            /* Queued invalidation info */
-- 
cgit v1.2.3-70-g09d2


From 519a05491586dad04e687660e54c57882315b22b Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 14:21:13 +0000
Subject: intel-iommu: make init_dmars() static

init_dmars() is not used outside of drivers/pci/intel-iommu.c

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 2 +-
 include/linux/dma_remapping.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 8e5a445ba93..95ae3a9aea8 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1589,7 +1589,7 @@ static inline void iommu_prepare_isa(void)
 }
 #endif /* !CONFIG_DMAR_FLPY_WA */
 
-int __init init_dmars(void)
+static int __init init_dmars(void)
 {
 	struct dmar_drhd_unit *drhd;
 	struct dmar_rmrr_unit *rmrr;
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 952df39c989..cf92c4924b8 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -141,7 +141,6 @@ struct device_domain_info {
 	struct dmar_domain *domain; /* pointer to domain */
 };
 
-extern int init_dmars(void);
 extern void free_dmar_iommu(struct intel_iommu *iommu);
 
 extern int dmar_disabled;
-- 
cgit v1.2.3-70-g09d2


From f27be03b271851fd54529f292c0f25b4c1f1a553 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:43 +0000
Subject: intel-iommu: move DMA_32/64BIT_PFN into intel-iommu.c

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 3 +++
 include/linux/dma_remapping.h | 5 -----
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 95ae3a9aea8..6fadbb9bc18 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -53,6 +53,9 @@
 
 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
 
+#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
+#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
+#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
 
 static void flush_unmaps_timeout(unsigned long data);
 
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index cf92c4924b8..2e5a5c0b6ac 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -9,11 +9,6 @@
 #define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
 #define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
 
-#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
-#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
-#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
-
-
 /*
  * 0: Present
  * 1-11: Reserved
-- 
cgit v1.2.3-70-g09d2


From 46b08e1a76b758193b0e7b889c6486a16eb1e9e2 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:44 +0000
Subject: intel-iommu: move root entry defs from dma_remapping.h

We keep the struct root_entry forward declaration for the
pointer in struct intel_iommu.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 33 +++++++++++++++++++++++++++++++++
 include/linux/dma_remapping.h | 34 +---------------------------------
 2 files changed, 34 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 6fadbb9bc18..29bf2d8176e 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -57,6 +57,39 @@
 #define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
 #define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)
 
+/*
+ * 0: Present
+ * 1-11: Reserved
+ * 12-63: Context Ptr (12 - (haw-1))
+ * 64-127: Reserved
+ */
+struct root_entry {
+	u64	val;
+	u64	rsvd1;
+};
+#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
+static inline bool root_present(struct root_entry *root)
+{
+	return (root->val & 1);
+}
+static inline void set_root_present(struct root_entry *root)
+{
+	root->val |= 1;
+}
+static inline void set_root_value(struct root_entry *root, unsigned long value)
+{
+	root->val |= value & VTD_PAGE_MASK;
+}
+
+static inline struct context_entry *
+get_context_addr_from_root(struct root_entry *root)
+{
+	return (struct context_entry *)
+		(root_present(root)?phys_to_virt(
+		root->val & VTD_PAGE_MASK) :
+		NULL);
+}
+
 static void flush_unmaps_timeout(unsigned long data);
 
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 2e5a5c0b6ac..d8521662a49 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -9,39 +9,7 @@
 #define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
 #define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
 
-/*
- * 0: Present
- * 1-11: Reserved
- * 12-63: Context Ptr (12 - (haw-1))
- * 64-127: Reserved
- */
-struct root_entry {
-	u64	val;
-	u64	rsvd1;
-};
-#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
-static inline bool root_present(struct root_entry *root)
-{
-	return (root->val & 1);
-}
-static inline void set_root_present(struct root_entry *root)
-{
-	root->val |= 1;
-}
-static inline void set_root_value(struct root_entry *root, unsigned long value)
-{
-	root->val |= value & VTD_PAGE_MASK;
-}
-
-struct context_entry;
-static inline struct context_entry *
-get_context_addr_from_root(struct root_entry *root)
-{
-	return (struct context_entry *)
-		(root_present(root)?phys_to_virt(
-		root->val & VTD_PAGE_MASK) :
-		NULL);
-}
+struct root_entry;
 
 /*
  * low 64 bits:
-- 
cgit v1.2.3-70-g09d2


From 7a8fc25e0cc6e75fa6fdb0a856490e324218550b Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:45 +0000
Subject: intel-iommu: move context entry defs out from dma_remapping.h

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/dma_remapping.h | 38 --------------------------------------
 2 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 29bf2d8176e..9d06f4bb6b5 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -90,6 +90,44 @@ get_context_addr_from_root(struct root_entry *root)
 		NULL);
 }
 
+/*
+ * low 64 bits:
+ * 0: present
+ * 1: fault processing disable
+ * 2-3: translation type
+ * 12-63: address space root
+ * high 64 bits:
+ * 0-2: address width
+ * 3-6: aval
+ * 8-23: domain id
+ */
+struct context_entry {
+	u64 lo;
+	u64 hi;
+};
+#define context_present(c) ((c).lo & 1)
+#define context_fault_disable(c) (((c).lo >> 1) & 1)
+#define context_translation_type(c) (((c).lo >> 2) & 3)
+#define context_address_root(c) ((c).lo & VTD_PAGE_MASK)
+#define context_address_width(c) ((c).hi &  7)
+#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
+
+#define context_set_present(c) do {(c).lo |= 1;} while (0)
+#define context_set_fault_enable(c) \
+	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
+#define context_set_translation_type(c, val) \
+	do { \
+		(c).lo &= (((u64)-1) << 4) | 3; \
+		(c).lo |= ((val) & 3) << 2; \
+	} while (0)
+#define CONTEXT_TT_MULTI_LEVEL 0
+#define context_set_address_root(c, val) \
+	do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
+#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
+#define context_set_domain_id(c, val) \
+	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
+#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
+
 static void flush_unmaps_timeout(unsigned long data);
 
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index d8521662a49..9a88f7d0262 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -11,44 +11,6 @@
 
 struct root_entry;
 
-/*
- * low 64 bits:
- * 0: present
- * 1: fault processing disable
- * 2-3: translation type
- * 12-63: address space root
- * high 64 bits:
- * 0-2: address width
- * 3-6: aval
- * 8-23: domain id
- */
-struct context_entry {
-	u64 lo;
-	u64 hi;
-};
-#define context_present(c) ((c).lo & 1)
-#define context_fault_disable(c) (((c).lo >> 1) & 1)
-#define context_translation_type(c) (((c).lo >> 2) & 3)
-#define context_address_root(c) ((c).lo & VTD_PAGE_MASK)
-#define context_address_width(c) ((c).hi &  7)
-#define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1))
-
-#define context_set_present(c) do {(c).lo |= 1;} while (0)
-#define context_set_fault_enable(c) \
-	do {(c).lo &= (((u64)-1) << 2) | 1;} while (0)
-#define context_set_translation_type(c, val) \
-	do { \
-		(c).lo &= (((u64)-1) << 4) | 3; \
-		(c).lo |= ((val) & 3) << 2; \
-	} while (0)
-#define CONTEXT_TT_MULTI_LEVEL 0
-#define context_set_address_root(c, val) \
-	do {(c).lo |= (val) & VTD_PAGE_MASK; } while (0)
-#define context_set_address_width(c, val) do {(c).hi |= (val) & 7;} while (0)
-#define context_set_domain_id(c, val) \
-	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
-#define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
-
 /*
  * 0: readable
  * 1: writable
-- 
cgit v1.2.3-70-g09d2


From 622ba12a4c2148999bda9b891bfd0c6ddcb6c57e Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:46 +0000
Subject: intel-iommu: move DMA PTE defs out of dma_remapping.h

DMA_PTE_READ/WRITE are needed by kvm.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 22 ++++++++++++++++++++++
 include/linux/dma_remapping.h | 22 ----------------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 9d06f4bb6b5..26c5402b6f7 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -128,6 +128,28 @@ struct context_entry {
 	do {(c).hi |= ((val) & ((1 << 16) - 1)) << 8;} while (0)
 #define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while (0)
 
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-11: available
+ * 12-63: Host physcial address
+ */
+struct dma_pte {
+	u64 val;
+};
+#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
+
+#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
+#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
+#define dma_set_pte_prot(p, prot) \
+		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
+#define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
+#define dma_set_pte_addr(p, addr) do {\
+		(p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
+#define dma_pte_present(p) (((p).val & 3) != 0)
+
 static void flush_unmaps_timeout(unsigned long data);
 
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 9a88f7d0262..9d5874e3bec 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -11,31 +11,9 @@
 
 struct root_entry;
 
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-11: available
- * 12-63: Host physcial address
- */
-struct dma_pte {
-	u64 val;
-};
-#define dma_clear_pte(p)	do {(p).val = 0;} while (0)
-
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
 
-#define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while (0)
-#define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while (0)
-#define dma_set_pte_prot(p, prot) \
-		do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0)
-#define dma_pte_addr(p) ((p).val & VTD_PAGE_MASK)
-#define dma_set_pte_addr(p, addr) do {\
-		(p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
-#define dma_pte_present(p) (((p).val & 3) != 0)
-
 struct intel_iommu;
 
 struct dmar_domain {
-- 
cgit v1.2.3-70-g09d2


From 99126f7ce14aff5f9371b2fa81fddb82be815794 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:47 +0000
Subject: intel-iommu: move struct dmar_domain def out dma_remapping.h

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 18 ++++++++++++++++++
 include/linux/dma_remapping.h | 22 ++--------------------
 2 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 26c5402b6f7..97c36b2ee61 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -150,6 +150,24 @@ struct dma_pte {
 		(p).val |= ((addr) & VTD_PAGE_MASK); } while (0)
 #define dma_pte_present(p) (((p).val & 3) != 0)
 
+struct dmar_domain {
+	int	id;			/* domain id */
+	struct intel_iommu *iommu;	/* back pointer to owning iommu */
+
+	struct list_head devices; 	/* all devices' list */
+	struct iova_domain iovad;	/* iova's that belong to this domain */
+
+	struct dma_pte	*pgd;		/* virtual address */
+	spinlock_t	mapping_lock;	/* page table lock */
+	int		gaw;		/* max guest address width */
+
+	/* adjusted guest address width, 0 is level 2 30-bit */
+	int		agaw;
+
+#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
+	int		flags;
+};
+
 static void flush_unmaps_timeout(unsigned long data);
 
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 9d5874e3bec..333014468f1 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -9,30 +9,12 @@
 #define VTD_PAGE_MASK		(((u64)-1) << VTD_PAGE_SHIFT)
 #define VTD_PAGE_ALIGN(addr)	(((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
 
-struct root_entry;
-
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
 
 struct intel_iommu;
-
-struct dmar_domain {
-	int	id;			/* domain id */
-	struct intel_iommu *iommu;	/* back pointer to owning iommu */
-
-	struct list_head devices; 	/* all devices' list */
-	struct iova_domain iovad;	/* iova's that belong to this domain */
-
-	struct dma_pte	*pgd;		/* virtual address */
-	spinlock_t	mapping_lock;	/* page table lock */
-	int		gaw;		/* max guest address width */
-
-	/* adjusted guest address width, 0 is level 2 30-bit */
-	int		agaw;
-
-#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
-	int		flags;
-};
+struct dmar_domain;
+struct root_entry;
 
 /* PCI domain-device relationship */
 struct device_domain_info {
-- 
cgit v1.2.3-70-g09d2


From a647dacbb1389aa6a5fa631766c1eaea35905890 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:48 +0000
Subject: intel-iommu: move struct device_domain_info out of dma_remapping.h

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 10 ++++++++++
 include/linux/dma_remapping.h | 10 ----------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 97c36b2ee61..f23a02054bf 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -168,6 +168,16 @@ struct dmar_domain {
 	int		flags;
 };
 
+/* PCI domain-device relationship */
+struct device_domain_info {
+	struct list_head link;	/* link to domain siblings */
+	struct list_head global; /* link to global list */
+	u8 bus;			/* PCI bus numer */
+	u8 devfn;		/* PCI devfn number */
+	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
+	struct dmar_domain *domain; /* pointer to domain */
+};
+
 static void flush_unmaps_timeout(unsigned long data);
 
 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 333014468f1..4ef5f6bc0d6 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -16,16 +16,6 @@ struct intel_iommu;
 struct dmar_domain;
 struct root_entry;
 
-/* PCI domain-device relationship */
-struct device_domain_info {
-	struct list_head link;	/* link to domain siblings */
-	struct list_head global; /* link to global list */
-	u8 bus;			/* PCI bus numer */
-	u8 devfn;		/* PCI devfn number */
-	struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
-	struct dmar_domain *domain; /* pointer to domain */
-};
-
 extern void free_dmar_iommu(struct intel_iommu *iommu);
 
 extern int dmar_disabled;
-- 
cgit v1.2.3-70-g09d2


From 58fa7304a2c2bfd46e505c293ef779aa1d9715c2 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:49 +0000
Subject: intel-iommu: kill off duplicate def of dmar_disabled

This is only used in dmar.c and intel-iommu.h, so dma_remapping.h
seems like the appropriate place for it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 include/linux/dmar.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index f1984fc3e06..f28440784cf 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -144,7 +144,6 @@ struct dmar_rmrr_unit {
 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 /* Intel DMAR  initialization functions */
 extern int intel_iommu_init(void);
-extern int dmar_disabled;
 #else
 static inline int intel_iommu_init(void)
 {
-- 
cgit v1.2.3-70-g09d2


From 2abd7e167c1b281f99bb58d302225872bfae9123 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Thu, 20 Nov 2008 15:49:50 +0000
Subject: intel-iommu: move iommu_prepare_gfx_mapping() out of dma_remapping.h

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 drivers/pci/intel-iommu.c     | 5 +++++
 include/linux/dma_remapping.h | 7 -------
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index f23a02054bf..c1c59a61965 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1686,6 +1686,11 @@ static void __init iommu_prepare_gfx_mapping(void)
 			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
 	}
 }
+#else /* !CONFIG_DMAR_GFX_WA */
+static inline void iommu_prepare_gfx_mapping(void)
+{
+	return;
+}
 #endif
 
 #ifdef CONFIG_DMAR_FLOPPY_WA
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 4ef5f6bc0d6..7799a85614c 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -20,11 +20,4 @@ extern void free_dmar_iommu(struct intel_iommu *iommu);
 
 extern int dmar_disabled;
 
-#ifndef CONFIG_DMAR_GFX_WA
-static inline void iommu_prepare_gfx_mapping(void)
-{
-	return;
-}
-#endif /* !CONFIG_DMAR_GFX_WA */
-
 #endif
-- 
cgit v1.2.3-70-g09d2


From 1b5736839ae13dadc5947940144f95dd0f4a4a8c Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Mon, 8 Dec 2008 15:34:06 +0800
Subject: calculate agaw for each iommu

"SAGAW" capability may be different across iommus. Use a default agaw, but if default agaw is not supported in some iommus, choose a less supported agaw.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/dmar.c            | 10 ++++++++++
 drivers/pci/intel-iommu.c     | 22 ++++++++++++++++++++++
 include/linux/dma_remapping.h |  1 +
 include/linux/intel-iommu.h   |  1 +
 4 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 5f164ff3026..f5a662a50ac 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -491,6 +491,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 	int map_size;
 	u32 ver;
 	static int iommu_allocated = 0;
+	int agaw;
 
 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 	if (!iommu)
@@ -506,6 +507,15 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
 	iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
 	iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
 
+	agaw = iommu_calculate_agaw(iommu);
+	if (agaw < 0) {
+		printk(KERN_ERR
+			"Cannot get a valid agaw for iommu (seq_id = %d)\n",
+			iommu->seq_id);
+		goto error;
+	}
+	iommu->agaw = agaw;
+
 	/* the registers might be more than one page */
 	map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
 		cap_max_fault_reg_offset(iommu->cap));
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 9dca689215e..3ecfa2304c2 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -362,6 +362,28 @@ void free_iova_mem(struct iova *iova)
 	kmem_cache_free(iommu_iova_cache, iova);
 }
 
+
+static inline int width_to_agaw(int width);
+
+/* calculate agaw for each iommu.
+ * "SAGAW" may be different across iommus, use a default agaw, and
+ * get a supported less agaw for iommus that don't support the default agaw.
+ */
+int iommu_calculate_agaw(struct intel_iommu *iommu)
+{
+	unsigned long sagaw;
+	int agaw = -1;
+
+	sagaw = cap_sagaw(iommu->cap);
+	for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
+	     agaw >= 0; agaw--) {
+		if (test_bit(agaw, &sagaw))
+			break;
+	}
+
+	return agaw;
+}
+
 /* in native case, each domain is related to only one iommu */
 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 {
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
index 7799a85614c..136f170cecc 100644
--- a/include/linux/dma_remapping.h
+++ b/include/linux/dma_remapping.h
@@ -17,6 +17,7 @@ struct dmar_domain;
 struct root_entry;
 
 extern void free_dmar_iommu(struct intel_iommu *iommu);
+extern int iommu_calculate_agaw(struct intel_iommu *iommu);
 
 extern int dmar_disabled;
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 1bff7bf1bc2..06349fd5871 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -290,6 +290,7 @@ struct intel_iommu {
 	u32		gcmd; /* Holds TE, EAFL. Don't need SRTP, SFL, WBF */
 	spinlock_t	register_lock; /* protect register handling */
 	int		seq_id;	/* sequence id of the iommu */
+	int		agaw; /* agaw of this iommu */
 
 #ifdef CONFIG_DMAR
 	unsigned long 	*domain_ids; /* bitmap of domains */
-- 
cgit v1.2.3-70-g09d2


From faa3d6f5ffe7bf60ebfd0d36513fbcda0eb0ea1a Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Mon, 8 Dec 2008 23:09:29 +0800
Subject: Change intel iommu APIs of virtual machine domain

These APIs are used by KVM to use VT-d

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c   | 129 ++++++++++++++++++++------------------------
 include/linux/intel-iommu.h |  20 +++----
 2 files changed, 70 insertions(+), 79 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 8a204d5bb42..f1380269cab 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -2944,96 +2944,87 @@ static void vm_domain_exit(struct dmar_domain *domain)
 	free_domain_mem(domain);
 }
 
-void intel_iommu_domain_exit(struct dmar_domain *domain)
+struct dmar_domain *intel_iommu_alloc_domain(void)
 {
-	u64 end;
-
-	/* Domain 0 is reserved, so dont process it */
-	if (!domain)
-		return;
-
-	end = DOMAIN_MAX_ADDR(domain->gaw);
-	end = end & (~VTD_PAGE_MASK);
-
-	/* clear ptes */
-	dma_pte_clear_range(domain, 0, end);
-
-	/* free page tables */
-	dma_pte_free_pagetable(domain, 0, end);
-
-	iommu_free_domain(domain);
-	free_domain_mem(domain);
-}
-EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
-
-struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
-{
-	struct dmar_drhd_unit *drhd;
 	struct dmar_domain *domain;
-	struct intel_iommu *iommu;
 
-	drhd = dmar_find_matched_drhd_unit(pdev);
-	if (!drhd) {
-		printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
-		return NULL;
-	}
-
-	iommu = drhd->iommu;
-	if (!iommu) {
-		printk(KERN_ERR
-			"intel_iommu_domain_alloc: iommu == NULL\n");
-		return NULL;
-	}
-	domain = iommu_alloc_domain(iommu);
+	domain = iommu_alloc_vm_domain();
 	if (!domain) {
 		printk(KERN_ERR
 			"intel_iommu_domain_alloc: domain == NULL\n");
 		return NULL;
 	}
-	if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+	if (vm_domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
 		printk(KERN_ERR
 			"intel_iommu_domain_alloc: domain_init() failed\n");
-		intel_iommu_domain_exit(domain);
+		vm_domain_exit(domain);
 		return NULL;
 	}
+
 	return domain;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
+EXPORT_SYMBOL_GPL(intel_iommu_alloc_domain);
 
-int intel_iommu_context_mapping(
-	struct dmar_domain *domain, struct pci_dev *pdev)
+void intel_iommu_free_domain(struct dmar_domain *domain)
 {
-	int rc;
-	rc = domain_context_mapping(domain, pdev);
-	return rc;
+	vm_domain_exit(domain);
 }
-EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
+EXPORT_SYMBOL_GPL(intel_iommu_free_domain);
 
-int intel_iommu_page_mapping(
-	struct dmar_domain *domain, dma_addr_t iova,
-	u64 hpa, size_t size, int prot)
+int intel_iommu_attach_device(struct dmar_domain *domain,
+			      struct pci_dev *pdev)
 {
-	int rc;
-	rc = domain_page_mapping(domain, iova, hpa, size, prot);
-	return rc;
+	int ret;
+
+	/* normally pdev is not mapped */
+	if (unlikely(domain_context_mapped(pdev))) {
+		struct dmar_domain *old_domain;
+
+		old_domain = find_domain(pdev);
+		if (old_domain) {
+			if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
+				vm_domain_remove_one_dev_info(old_domain, pdev);
+			else
+				domain_remove_dev_info(old_domain);
+		}
+	}
+
+	ret = domain_context_mapping(domain, pdev);
+	if (ret)
+		return ret;
+
+	ret = vm_domain_add_dev_info(domain, pdev);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
+EXPORT_SYMBOL_GPL(intel_iommu_attach_device);
 
-void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
+void intel_iommu_detach_device(struct dmar_domain *domain,
+			       struct pci_dev *pdev)
 {
-	struct intel_iommu *iommu;
+	vm_domain_remove_one_dev_info(domain, pdev);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_detach_device);
 
-	iommu = device_to_iommu(bus, devfn);
-	iommu_detach_dev(iommu, bus, devfn);
+int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
+			    u64 hpa, size_t size, int prot)
+{
+	int ret;
+	ret = domain_page_mapping(domain, iova, hpa, size, prot);
+	return ret;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
+EXPORT_SYMBOL_GPL(intel_iommu_map_address);
 
-struct dmar_domain *
-intel_iommu_find_domain(struct pci_dev *pdev)
+void intel_iommu_unmap_address(struct dmar_domain *domain,
+			       dma_addr_t iova, size_t size)
 {
-	return find_domain(pdev);
+	dma_addr_t base;
+
+	/* The address might not be aligned */
+	base = iova & VTD_PAGE_MASK;
+	size = VTD_PAGE_ALIGN(size);
+	dma_pte_clear_range(domain, base, base + size);
 }
-EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
+EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
 
 int intel_iommu_found(void)
 {
@@ -3041,17 +3032,15 @@ int intel_iommu_found(void)
 }
 EXPORT_SYMBOL_GPL(intel_iommu_found);
 
-u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
+u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
 {
 	struct dma_pte *pte;
-	u64 pfn;
+	u64 phys = 0;
 
-	pfn = 0;
 	pte = addr_to_dma_pte(domain, iova);
-
 	if (pte)
-		pfn = dma_pte_addr(pte);
+		phys = dma_pte_addr(pte);
 
-	return pfn >> VTD_PAGE_SHIFT;
+	return phys;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
+EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 06349fd5871..07973c4e4ac 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -330,15 +330,17 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-void intel_iommu_domain_exit(struct dmar_domain *domain);
-struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
-int intel_iommu_context_mapping(struct dmar_domain *domain,
-				struct pci_dev *pdev);
-int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
-			     u64 hpa, size_t size, int prot);
-void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
-struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev);
-u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova);
+struct dmar_domain *intel_iommu_alloc_domain(void);
+void intel_iommu_free_domain(struct dmar_domain *domain);
+int intel_iommu_attach_device(struct dmar_domain *domain,
+			      struct pci_dev *pdev);
+void intel_iommu_detach_device(struct dmar_domain *domain,
+			       struct pci_dev *pdev);
+int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
+			    u64 hpa, size_t size, int prot);
+void intel_iommu_unmap_address(struct dmar_domain *domain,
+			       dma_addr_t iova, size_t size);
+u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova);
 
 #ifdef CONFIG_DMAR
 int intel_iommu_found(void);
-- 
cgit v1.2.3-70-g09d2


From 260782bcfdaaa7850f29d6bb2ec6603019168c57 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Tue, 2 Dec 2008 21:03:39 +0800
Subject: KVM: use the new intel iommu APIs

intel iommu APIs are updated, use the new APIs.

In addition, change kvm_iommu_map_guest() to just create the domain, let kvm_iommu_assign_device() assign device.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/kvm_host.h | 15 +++++---
 virt/kvm/kvm_main.c      |  7 +++-
 virt/kvm/vtd.c           | 98 +++++++++++++++++++++++++++---------------------
 3 files changed, 71 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eafabd5c66b..c96739b4b7a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -330,9 +330,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 #ifdef CONFIG_DMAR
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
 			unsigned long npages);
-int kvm_iommu_map_guest(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_iommu_map_guest(struct kvm *kvm);
 int kvm_iommu_unmap_guest(struct kvm *kvm);
+int kvm_assign_device(struct kvm *kvm,
+		      struct kvm_assigned_dev_kernel *assigned_dev);
 #else /* CONFIG_DMAR */
 static inline int kvm_iommu_map_pages(struct kvm *kvm,
 				      gfn_t base_gfn,
@@ -341,9 +342,7 @@ static inline int kvm_iommu_map_pages(struct kvm *kvm,
 	return 0;
 }
 
-static inline int kvm_iommu_map_guest(struct kvm *kvm,
-				      struct kvm_assigned_dev_kernel
-				      *assigned_dev)
+static inline int kvm_iommu_map_guest(struct kvm *kvm)
 {
 	return -ENODEV;
 }
@@ -352,6 +351,12 @@ static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
 {
 	return 0;
 }
+
+static inline int kvm_assign_device(struct kvm *kvm,
+		struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	return 0;
+}
 #endif /* CONFIG_DMAR */
 
 static inline void kvm_guest_enter(void)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fc6127cbea1..c92b63462b7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -503,7 +503,12 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
 	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		r = kvm_iommu_map_guest(kvm, match);
+		if (!kvm->arch.intel_iommu_domain) {
+			r = kvm_iommu_map_guest(kvm);
+			if (r)
+				goto out_list_del;
+		}
+		r = kvm_assign_device(kvm, match);
 		if (r)
 			goto out_list_del;
 	}
diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c
index a770874f3a3..44bb58a395a 100644
--- a/virt/kvm/vtd.c
+++ b/virt/kvm/vtd.c
@@ -45,20 +45,18 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
-		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
-						     gfn_to_gpa(gfn));
-		if (pfn)
+		if (intel_iommu_iova_to_phys(domain,
+					     gfn_to_gpa(gfn)))
 			continue;
 
 		pfn = gfn_to_pfn(kvm, gfn);
-		r = intel_iommu_page_mapping(domain,
-					     gfn_to_gpa(gfn),
-					     pfn_to_hpa(pfn),
-					     PAGE_SIZE,
-					     DMA_PTE_READ |
-					     DMA_PTE_WRITE);
+		r = intel_iommu_map_address(domain,
+					    gfn_to_gpa(gfn),
+					    pfn_to_hpa(pfn),
+					    PAGE_SIZE,
+					    DMA_PTE_READ | DMA_PTE_WRITE);
 		if (r) {
-			printk(KERN_ERR "kvm_iommu_map_pages:"
+			printk(KERN_ERR "kvm_iommu_map_address:"
 			       "iommu failed to map pfn=%lx\n", pfn);
 			goto unmap_pages;
 		}
@@ -86,50 +84,55 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
 	return r;
 }
 
-int kvm_iommu_map_guest(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *assigned_dev)
+int kvm_assign_device(struct kvm *kvm,
+		      struct kvm_assigned_dev_kernel *assigned_dev)
 {
 	struct pci_dev *pdev = NULL;
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
 	int r;
 
-	if (!intel_iommu_found()) {
-		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	pdev = assigned_dev->dev;
+	if (pdev == NULL)
 		return -ENODEV;
+
+	r = intel_iommu_attach_device(domain, pdev);
+	if (r) {
+		printk(KERN_ERR "assign device %x:%x.%x failed",
+			pdev->bus->number,
+			PCI_SLOT(pdev->devfn),
+			PCI_FUNC(pdev->devfn));
+		return r;
 	}
 
-	printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
-	       assigned_dev->host_busnr,
-	       PCI_SLOT(assigned_dev->host_devfn),
-	       PCI_FUNC(assigned_dev->host_devfn));
+	printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
+		assigned_dev->host_busnr,
+		PCI_SLOT(assigned_dev->host_devfn),
+		PCI_FUNC(assigned_dev->host_devfn));
 
-	pdev = assigned_dev->dev;
+	return 0;
+}
 
-	if (pdev == NULL) {
-		if (kvm->arch.intel_iommu_domain) {
-			intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
-			kvm->arch.intel_iommu_domain = NULL;
-		}
+int kvm_iommu_map_guest(struct kvm *kvm)
+{
+	int r;
+
+	if (!intel_iommu_found()) {
+		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
 		return -ENODEV;
 	}
 
-	kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
+	kvm->arch.intel_iommu_domain = intel_iommu_alloc_domain();
 	if (!kvm->arch.intel_iommu_domain)
-		return -ENODEV;
+		return -ENOMEM;
 
 	r = kvm_iommu_map_memslots(kvm);
 	if (r)
 		goto out_unmap;
 
-	intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
-			       pdev->bus->number, pdev->devfn);
-
-	r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
-					pdev);
-	if (r) {
-		printk(KERN_ERR "Domain context map for %s failed",
-		       pci_name(pdev));
-		goto out_unmap;
-	}
 	return 0;
 
 out_unmap:
@@ -138,19 +141,29 @@ out_unmap:
 }
 
 static void kvm_iommu_put_pages(struct kvm *kvm,
-			       gfn_t base_gfn, unsigned long npages)
+				gfn_t base_gfn, unsigned long npages)
 {
 	gfn_t gfn = base_gfn;
 	pfn_t pfn;
 	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
-	int i;
+	unsigned long i;
+	u64 phys;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return;
 
 	for (i = 0; i < npages; i++) {
-		pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
-						     gfn_to_gpa(gfn));
+		phys = intel_iommu_iova_to_phys(domain,
+						gfn_to_gpa(gfn));
+		pfn = phys >> PAGE_SHIFT;
 		kvm_release_pfn_clean(pfn);
 		gfn++;
 	}
+
+	intel_iommu_unmap_address(domain,
+				  gfn_to_gpa(base_gfn),
+				  PAGE_SIZE * npages);
 }
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm)
@@ -182,10 +195,9 @@ int kvm_iommu_unmap_guest(struct kvm *kvm)
 		       PCI_FUNC(entry->host_devfn));
 
 		/* detach kvm dmar domain */
-		intel_iommu_detach_dev(domain, entry->host_busnr,
-				       entry->host_devfn);
+		intel_iommu_detach_device(domain, entry->dev);
 	}
 	kvm_iommu_unmap_memslots(kvm);
-	intel_iommu_domain_exit(domain);
+	intel_iommu_free_domain(domain);
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 0a920356748df4fb06e86c21c23d2ed6d31d37ad Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Tue, 2 Dec 2008 21:24:23 +0800
Subject: KVM: support device deassignment

Support device deassignment, it can be used in device hotplug.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/kvm_host.h |  8 ++++++++
 virt/kvm/kvm_main.c      | 42 ++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/vtd.c           | 24 ++++++++++++++++++++++++
 3 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c96739b4b7a..ce5d1c17ce2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -334,6 +334,8 @@ int kvm_iommu_map_guest(struct kvm *kvm);
 int kvm_iommu_unmap_guest(struct kvm *kvm);
 int kvm_assign_device(struct kvm *kvm,
 		      struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_deassign_device(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev);
 #else /* CONFIG_DMAR */
 static inline int kvm_iommu_map_pages(struct kvm *kvm,
 				      gfn_t base_gfn,
@@ -357,6 +359,12 @@ static inline int kvm_assign_device(struct kvm *kvm,
 {
 	return 0;
 }
+
+static inline int kvm_deassign_device(struct kvm *kvm,
+		struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	return 0;
+}
 #endif /* CONFIG_DMAR */
 
 static inline void kvm_guest_enter(void)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c92b63462b7..3238e08e465 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -530,6 +530,35 @@ out_free:
 }
 #endif
 
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+		struct kvm_assigned_pci_dev *assigned_dev)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev->assigned_dev_id);
+	if (!match) {
+		printk(KERN_INFO "%s: device hasn't been assigned before, "
+		  "so cannot be deassigned\n", __func__);
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
+		kvm_deassign_device(kvm, match);
+
+	kvm_free_assigned_device(kvm, match);
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+#endif
+
 static inline int valid_vcpu(int n)
 {
 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
@@ -1862,6 +1891,19 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
+#endif
+#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
+	case KVM_DEASSIGN_PCI_DEVICE: {
+		struct kvm_assigned_pci_dev assigned_dev;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+			goto out;
+		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+		if (r)
+			goto out;
+		break;
+	}
 #endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c
index 44bb58a395a..174ea1f8cee 100644
--- a/virt/kvm/vtd.c
+++ b/virt/kvm/vtd.c
@@ -116,6 +116,30 @@ int kvm_assign_device(struct kvm *kvm,
 	return 0;
 }
 
+int kvm_deassign_device(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	struct pci_dev *pdev = NULL;
+
+	/* check if iommu exists and in use */
+	if (!domain)
+		return 0;
+
+	pdev = assigned_dev->dev;
+	if (pdev == NULL)
+		return -ENODEV;
+
+	intel_iommu_detach_device(domain, pdev);
+
+	printk(KERN_DEBUG "deassign device: host bdf = %x:%x:%x\n",
+		assigned_dev->host_busnr,
+		PCI_SLOT(assigned_dev->host_devfn),
+		PCI_FUNC(assigned_dev->host_devfn));
+
+	return 0;
+}
+
 int kvm_iommu_map_guest(struct kvm *kvm)
 {
 	int r;
-- 
cgit v1.2.3-70-g09d2


From b653574a7d14b663cc812cb20be6a114939ba186 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Mon, 8 Dec 2008 23:29:53 +0800
Subject: Deassign device in kvm_free_assgined_device

In kvm_iommu_unmap_memslots(), assigned_dev_head is already empty.

Signed-off-by: Weidong Han <weidong.han@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      |  1 +
 virt/kvm/vtd.c           | 10 ----------
 3 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ce5d1c17ce2..e62a4629e51 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -316,6 +316,7 @@ struct kvm_assigned_dev_kernel {
 #define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
 	unsigned long irq_requested_type;
 	int irq_source_id;
+	int flags;
 	struct pci_dev *dev;
 	struct kvm *kvm;
 };
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3238e08e465..4ef0fb43d1f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -496,6 +496,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->assigned_dev_id = assigned_dev->assigned_dev_id;
 	match->host_busnr = assigned_dev->busnr;
 	match->host_devfn = assigned_dev->devfn;
+	match->flags = assigned_dev->flags;
 	match->dev = dev;
 	match->irq_source_id = -1;
 	match->kvm = kvm;
diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c
index 174ea1f8cee..d46de9af838 100644
--- a/virt/kvm/vtd.c
+++ b/virt/kvm/vtd.c
@@ -205,22 +205,12 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 
 int kvm_iommu_unmap_guest(struct kvm *kvm)
 {
-	struct kvm_assigned_dev_kernel *entry;
 	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
-	list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
-		printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
-		       entry->host_busnr,
-		       PCI_SLOT(entry->host_devfn),
-		       PCI_FUNC(entry->host_devfn));
-
-		/* detach kvm dmar domain */
-		intel_iommu_detach_device(domain, entry->dev);
-	}
 	kvm_iommu_unmap_memslots(kvm);
 	intel_iommu_free_domain(domain);
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 4a77a6cf6d9bf9f5c74b27f62bd2bfe6dcc88392 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 26 Nov 2008 17:02:33 +0100
Subject: introcude linux/iommu.h for an iommu api

This patch introduces the API to abstract the exported VT-d functions
for KVM into a generic API. This way the AMD IOMMU implementation can
plug into this API later.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 include/linux/iommu.h | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 include/linux/iommu.h

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
new file mode 100644
index 00000000000..8a7bfb1b6ca
--- /dev/null
+++ b/include/linux/iommu.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#ifndef __LINUX_IOMMU_H
+#define __LINUX_IOMMU_H
+
+#define IOMMU_READ	(1)
+#define IOMMU_WRITE	(2)
+
+struct device;
+
+struct iommu_domain {
+	void *priv;
+};
+
+struct iommu_ops {
+	int (*domain_init)(struct iommu_domain *domain);
+	void (*domain_destroy)(struct iommu_domain *domain);
+	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
+	void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
+	int (*map)(struct iommu_domain *domain, unsigned long iova,
+		   phys_addr_t paddr, size_t size, int prot);
+	void (*unmap)(struct iommu_domain *domain, unsigned long iova,
+		      size_t size);
+	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
+				    unsigned long iova);
+};
+
+#ifdef CONFIG_IOMMU_API
+
+extern void register_iommu(struct iommu_ops *ops);
+extern bool iommu_found(void);
+extern struct iommu_domain *iommu_domain_alloc(void);
+extern void iommu_domain_free(struct iommu_domain *domain);
+extern int iommu_attach_device(struct iommu_domain *domain,
+			       struct device *dev);
+extern void iommu_detach_device(struct iommu_domain *domain,
+				struct device *dev);
+extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
+			   phys_addr_t paddr, size_t size, int prot);
+extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
+			      size_t size);
+extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
+				      unsigned long iova);
+
+#else /* CONFIG_IOMMU_API */
+
+static inline void register_iommu(struct iommu_ops *ops)
+{
+}
+
+static inline bool iommu_found(void)
+{
+	return false;
+}
+
+static inline struct iommu_domain *iommu_domain_alloc(void)
+{
+	return NULL;
+}
+
+static inline void iommu_domain_free(struct iommu_domain *domain)
+{
+}
+
+static inline int iommu_attach_device(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	return -ENODEV;
+}
+
+static inline void iommu_detach_device(struct iommu_domain *domain,
+				       struct device *dev)
+{
+}
+
+static inline int iommu_map_range(struct iommu_domain *domain,
+				  unsigned long iova, phys_addr_t paddr,
+				  size_t size, int prot)
+{
+	return -ENODEV;
+}
+
+static inline void iommu_unmap_range(struct iommu_domain *domain,
+				     unsigned long iova, size_t size)
+{
+}
+
+static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
+					     unsigned long iova)
+{
+	return 0;
+}
+
+#endif /* CONFIG_IOMMU_API */
+
+#endif /* __LINUX_IOMMU_H */
-- 
cgit v1.2.3-70-g09d2


From 19de40a8472fa64693eab844911eec277d489f6c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 3 Dec 2008 14:43:34 +0100
Subject: KVM: change KVM to use IOMMU API

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/ia64/include/asm/kvm_host.h |  2 +-
 arch/ia64/kvm/Makefile           |  2 +-
 arch/ia64/kvm/kvm-ia64.c         |  3 ++-
 arch/x86/include/asm/kvm_host.h  |  2 +-
 arch/x86/kvm/Makefile            |  2 +-
 arch/x86/kvm/x86.c               |  3 ++-
 include/linux/kvm_host.h         |  6 +++---
 virt/kvm/iommu.c                 | 45 +++++++++++++++++++---------------------
 virt/kvm/kvm_main.c              |  2 +-
 9 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 0560f3fae53..34866366165 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_arch {
 	struct kvm_sal_data rdv_sal_data;
 
 	struct list_head assigned_dev_head;
-	struct dmar_domain *intel_iommu_domain;
+	struct iommu_domain *iommu_domain;
 	struct hlist_head irq_ack_notifier_list;
 
 	unsigned long irq_sources_bitmap;
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index cb69dfcfb1a..0bb99b73290 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -51,7 +51,7 @@ EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 		coalesced_mmio.o irq_comm.o)
 
-ifeq ($(CONFIG_DMAR),y)
+ifeq ($(CONFIG_IOMMU_API),y)
 common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
 endif
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0f5ebd94843..4e586f6110a 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/hrtimer.h>
 #include <linux/uaccess.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
 #include <asm/pgtable.h>
@@ -188,7 +189,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
 	case KVM_CAP_IOMMU:
-		r = intel_iommu_found();
+		r = iommu_found();
 		break;
 	default:
 		r = 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 97215a458e5..730843d1d2f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -360,7 +360,7 @@ struct kvm_arch{
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
 	struct list_head oos_global_pages;
-	struct dmar_domain *intel_iommu_domain;
+	struct iommu_domain *iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 00f46c2d14b..d3ec292f00f 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
-ifeq ($(CONFIG_DMAR),y)
+ifeq ($(CONFIG_IOMMU_API),y)
 common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
 endif
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e6aa8141dc..cc17546a240 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
 #include <asm/uaccess.h>
@@ -989,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = !tdp_enabled;
 		break;
 	case KVM_CAP_IOMMU:
-		r = intel_iommu_found();
+		r = iommu_found();
 		break;
 	default:
 		r = 0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e62a4629e51..ec49d0be7f5 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -328,7 +328,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
-#ifdef CONFIG_DMAR
+#ifdef CONFIG_IOMMU_API
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
 			unsigned long npages);
 int kvm_iommu_map_guest(struct kvm *kvm);
@@ -337,7 +337,7 @@ int kvm_assign_device(struct kvm *kvm,
 		      struct kvm_assigned_dev_kernel *assigned_dev);
 int kvm_deassign_device(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *assigned_dev);
-#else /* CONFIG_DMAR */
+#else /* CONFIG_IOMMU_API */
 static inline int kvm_iommu_map_pages(struct kvm *kvm,
 				      gfn_t base_gfn,
 				      unsigned long npages)
@@ -366,7 +366,7 @@ static inline int kvm_deassign_device(struct kvm *kvm,
 {
 	return 0;
 }
-#endif /* CONFIG_DMAR */
+#endif /* CONFIG_IOMMU_API */
 
 static inline void kvm_guest_enter(void)
 {
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index d46de9af838..d0bebaa5bf0 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -25,6 +25,7 @@
 #include <linux/kvm_host.h>
 #include <linux/pci.h>
 #include <linux/dmar.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm);
@@ -37,7 +38,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 	gfn_t gfn = base_gfn;
 	pfn_t pfn;
 	int i, r = 0;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
 
 	/* check if iommu exists and in use */
 	if (!domain)
@@ -45,16 +46,15 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
-		if (intel_iommu_iova_to_phys(domain,
-					     gfn_to_gpa(gfn)))
+		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
 			continue;
 
 		pfn = gfn_to_pfn(kvm, gfn);
-		r = intel_iommu_map_address(domain,
-					    gfn_to_gpa(gfn),
-					    pfn_to_hpa(pfn),
-					    PAGE_SIZE,
-					    DMA_PTE_READ | DMA_PTE_WRITE);
+		r = iommu_map_range(domain,
+				    gfn_to_gpa(gfn),
+				    pfn_to_hpa(pfn),
+				    PAGE_SIZE,
+				    IOMMU_READ | IOMMU_WRITE);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
 			       "iommu failed to map pfn=%lx\n", pfn);
@@ -88,7 +88,7 @@ int kvm_assign_device(struct kvm *kvm,
 		      struct kvm_assigned_dev_kernel *assigned_dev)
 {
 	struct pci_dev *pdev = NULL;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
 	int r;
 
 	/* check if iommu exists and in use */
@@ -99,7 +99,7 @@ int kvm_assign_device(struct kvm *kvm,
 	if (pdev == NULL)
 		return -ENODEV;
 
-	r = intel_iommu_attach_device(domain, pdev);
+	r = iommu_attach_device(domain, &pdev->dev);
 	if (r) {
 		printk(KERN_ERR "assign device %x:%x.%x failed",
 			pdev->bus->number,
@@ -119,7 +119,7 @@ int kvm_assign_device(struct kvm *kvm,
 int kvm_deassign_device(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *assigned_dev)
 {
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
 	struct pci_dev *pdev = NULL;
 
 	/* check if iommu exists and in use */
@@ -130,7 +130,7 @@ int kvm_deassign_device(struct kvm *kvm,
 	if (pdev == NULL)
 		return -ENODEV;
 
-	intel_iommu_detach_device(domain, pdev);
+	iommu_detach_device(domain, &pdev->dev);
 
 	printk(KERN_DEBUG "deassign device: host bdf = %x:%x:%x\n",
 		assigned_dev->host_busnr,
@@ -144,13 +144,13 @@ int kvm_iommu_map_guest(struct kvm *kvm)
 {
 	int r;
 
-	if (!intel_iommu_found()) {
-		printk(KERN_ERR "%s: intel iommu not found\n", __func__);
+	if (!iommu_found()) {
+		printk(KERN_ERR "%s: iommu not found\n", __func__);
 		return -ENODEV;
 	}
 
-	kvm->arch.intel_iommu_domain = intel_iommu_alloc_domain();
-	if (!kvm->arch.intel_iommu_domain)
+	kvm->arch.iommu_domain = iommu_domain_alloc();
+	if (!kvm->arch.iommu_domain)
 		return -ENOMEM;
 
 	r = kvm_iommu_map_memslots(kvm);
@@ -169,7 +169,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 {
 	gfn_t gfn = base_gfn;
 	pfn_t pfn;
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
 	unsigned long i;
 	u64 phys;
 
@@ -178,16 +178,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 		return;
 
 	for (i = 0; i < npages; i++) {
-		phys = intel_iommu_iova_to_phys(domain,
-						gfn_to_gpa(gfn));
+		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
 		pfn = phys >> PAGE_SHIFT;
 		kvm_release_pfn_clean(pfn);
 		gfn++;
 	}
 
-	intel_iommu_unmap_address(domain,
-				  gfn_to_gpa(base_gfn),
-				  PAGE_SIZE * npages);
+	iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages);
 }
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm)
@@ -205,13 +202,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 
 int kvm_iommu_unmap_guest(struct kvm *kvm)
 {
-	struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+	struct iommu_domain *domain = kvm->arch.iommu_domain;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
 	kvm_iommu_unmap_memslots(kvm);
-	intel_iommu_free_domain(domain);
+	iommu_domain_free(domain);
 	return 0;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4ef0fb43d1f..3a5a08298aa 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -504,7 +504,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
 	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
-		if (!kvm->arch.intel_iommu_domain) {
+		if (!kvm->arch.iommu_domain) {
 			r = kvm_iommu_map_guest(kvm);
 			if (r)
 				goto out_list_del;
-- 
cgit v1.2.3-70-g09d2


From 5d450806eb0e569c5846a5825e7f535980b0da32 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 3 Dec 2008 14:52:32 +0100
Subject: VT-d: adapt domain init and destroy functions for IOMMU API

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c   | 33 ++++++++++++++++++---------------
 include/linux/intel-iommu.h |  2 --
 2 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 772fb22e1be..5c95a5a6544 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -34,6 +34,7 @@
 #include <linux/mempool.h>
 #include <linux/timer.h>
 #include <linux/iova.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
@@ -2962,32 +2963,34 @@ static void vm_domain_exit(struct dmar_domain *domain)
 	free_domain_mem(domain);
 }
 
-struct dmar_domain *intel_iommu_alloc_domain(void)
+static int intel_iommu_domain_init(struct iommu_domain *domain)
 {
-	struct dmar_domain *domain;
+	struct dmar_domain *dmar_domain;
 
-	domain = iommu_alloc_vm_domain();
-	if (!domain) {
+	dmar_domain = iommu_alloc_vm_domain();
+	if (!dmar_domain) {
 		printk(KERN_ERR
-			"intel_iommu_domain_alloc: domain == NULL\n");
-		return NULL;
+			"intel_iommu_domain_init: dmar_domain == NULL\n");
+		return -ENOMEM;
 	}
-	if (vm_domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+	if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
 		printk(KERN_ERR
-			"intel_iommu_domain_alloc: domain_init() failed\n");
-		vm_domain_exit(domain);
-		return NULL;
+			"intel_iommu_domain_init() failed\n");
+		vm_domain_exit(dmar_domain);
+		return -ENOMEM;
 	}
+	domain->priv = dmar_domain;
 
-	return domain;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_alloc_domain);
 
-void intel_iommu_free_domain(struct dmar_domain *domain)
+static void intel_iommu_domain_destroy(struct iommu_domain *domain)
 {
-	vm_domain_exit(domain);
+	struct dmar_domain *dmar_domain = domain->priv;
+
+	domain->priv = NULL;
+	vm_domain_exit(dmar_domain);
 }
-EXPORT_SYMBOL_GPL(intel_iommu_free_domain);
 
 int intel_iommu_attach_device(struct dmar_domain *domain,
 			      struct pci_dev *pdev)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 07973c4e4ac..0a7ba0cefc7 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -330,8 +330,6 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-struct dmar_domain *intel_iommu_alloc_domain(void);
-void intel_iommu_free_domain(struct dmar_domain *domain);
 int intel_iommu_attach_device(struct dmar_domain *domain,
 			      struct pci_dev *pdev);
 void intel_iommu_detach_device(struct dmar_domain *domain,
-- 
cgit v1.2.3-70-g09d2


From 4c5478c94eb29e6101f1f13175f7455bc8b5d953 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 3 Dec 2008 14:58:24 +0100
Subject: VT-d: adapt device attach and detach functions for IOMMU API

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c   | 27 +++++++++++++++------------
 include/linux/intel-iommu.h |  4 ----
 2 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 5c95a5a6544..db9a26cfeb8 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -2992,9 +2992,11 @@ static void intel_iommu_domain_destroy(struct iommu_domain *domain)
 	vm_domain_exit(dmar_domain);
 }
 
-int intel_iommu_attach_device(struct dmar_domain *domain,
-			      struct pci_dev *pdev)
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+				     struct device *dev)
 {
+	struct dmar_domain *dmar_domain = domain->priv;
+	struct pci_dev *pdev = to_pci_dev(dev);
 	struct intel_iommu *iommu;
 	int addr_width;
 	u64 end;
@@ -3006,7 +3008,7 @@ int intel_iommu_attach_device(struct dmar_domain *domain,
 
 		old_domain = find_domain(pdev);
 		if (old_domain) {
-			if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
+			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
 				vm_domain_remove_one_dev_info(old_domain, pdev);
 			else
 				domain_remove_dev_info(old_domain);
@@ -3021,28 +3023,29 @@ int intel_iommu_attach_device(struct dmar_domain *domain,
 	addr_width = agaw_to_width(iommu->agaw);
 	end = DOMAIN_MAX_ADDR(addr_width);
 	end = end & VTD_PAGE_MASK;
-	if (end < domain->max_addr) {
+	if (end < dmar_domain->max_addr) {
 		printk(KERN_ERR "%s: iommu agaw (%d) is not "
 		       "sufficient for the mapped address (%llx)\n",
-		       __func__, iommu->agaw, domain->max_addr);
+		       __func__, iommu->agaw, dmar_domain->max_addr);
 		return -EFAULT;
 	}
 
-	ret = domain_context_mapping(domain, pdev);
+	ret = domain_context_mapping(dmar_domain, pdev);
 	if (ret)
 		return ret;
 
-	ret = vm_domain_add_dev_info(domain, pdev);
+	ret = vm_domain_add_dev_info(dmar_domain, pdev);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_attach_device);
 
-void intel_iommu_detach_device(struct dmar_domain *domain,
-			       struct pci_dev *pdev)
+static void intel_iommu_detach_device(struct iommu_domain *domain,
+				      struct device *dev)
 {
-	vm_domain_remove_one_dev_info(domain, pdev);
+	struct dmar_domain *dmar_domain = domain->priv;
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	vm_domain_remove_one_dev_info(dmar_domain, pdev);
 }
-EXPORT_SYMBOL_GPL(intel_iommu_detach_device);
 
 int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
 			    u64 hpa, size_t size, int prot)
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0a7ba0cefc7..9909c5a1b20 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -330,10 +330,6 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-int intel_iommu_attach_device(struct dmar_domain *domain,
-			      struct pci_dev *pdev);
-void intel_iommu_detach_device(struct dmar_domain *domain,
-			       struct pci_dev *pdev);
 int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
 			    u64 hpa, size_t size, int prot);
 void intel_iommu_unmap_address(struct dmar_domain *domain,
-- 
cgit v1.2.3-70-g09d2


From dde57a210dcdce85e2813bab8f88687761d9f6a6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 3 Dec 2008 15:04:09 +0100
Subject: VT-d: adapt domain map and unmap functions for IOMMU API

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c   | 33 ++++++++++++++++++++-------------
 include/linux/intel-iommu.h |  4 ----
 2 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index db9a26cfeb8..8af6c96f31b 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3047,20 +3047,28 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
 	vm_domain_remove_one_dev_info(dmar_domain, pdev);
 }
 
-int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
-			    u64 hpa, size_t size, int prot)
+static int intel_iommu_map_range(struct iommu_domain *domain,
+				 unsigned long iova, phys_addr_t hpa,
+				 size_t size, int iommu_prot)
 {
+	struct dmar_domain *dmar_domain = domain->priv;
 	u64 max_addr;
 	int addr_width;
+	int prot = 0;
 	int ret;
 
+	if (iommu_prot & IOMMU_READ)
+		prot |= DMA_PTE_READ;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= DMA_PTE_WRITE;
+
 	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
-	if (domain->max_addr < max_addr) {
+	if (dmar_domain->max_addr < max_addr) {
 		int min_agaw;
 		u64 end;
 
 		/* check if minimum agaw is sufficient for mapped address */
-		min_agaw = vm_domain_min_agaw(domain);
+		min_agaw = vm_domain_min_agaw(dmar_domain);
 		addr_width = agaw_to_width(min_agaw);
 		end = DOMAIN_MAX_ADDR(addr_width);
 		end = end & VTD_PAGE_MASK;
@@ -3070,28 +3078,27 @@ int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
 			       __func__, min_agaw, max_addr);
 			return -EFAULT;
 		}
-		domain->max_addr = max_addr;
+		dmar_domain->max_addr = max_addr;
 	}
 
-	ret = domain_page_mapping(domain, iova, hpa, size, prot);
+	ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_map_address);
 
-void intel_iommu_unmap_address(struct dmar_domain *domain,
-			       dma_addr_t iova, size_t size)
+static void intel_iommu_unmap_range(struct iommu_domain *domain,
+				    unsigned long iova, size_t size)
 {
+	struct dmar_domain *dmar_domain = domain->priv;
 	dma_addr_t base;
 
 	/* The address might not be aligned */
 	base = iova & VTD_PAGE_MASK;
 	size = VTD_PAGE_ALIGN(size);
-	dma_pte_clear_range(domain, base, base + size);
+	dma_pte_clear_range(dmar_domain, base, base + size);
 
-	if (domain->max_addr == base + size)
-		domain->max_addr = base;
+	if (dmar_domain->max_addr == base + size)
+		dmar_domain->max_addr = base;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
 
 int intel_iommu_found(void)
 {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 9909c5a1b20..6bc26e03858 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -330,10 +330,6 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
-			    u64 hpa, size_t size, int prot);
-void intel_iommu_unmap_address(struct dmar_domain *domain,
-			       dma_addr_t iova, size_t size);
 u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova);
 
 #ifdef CONFIG_DMAR
-- 
cgit v1.2.3-70-g09d2


From d14d65777c2491dd5baf1e17f444b8f653f3cbb1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 3 Dec 2008 15:06:57 +0100
Subject: VT-d: adapt domain iova_to_phys function for IOMMU API

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c   | 7 ++++---
 include/linux/intel-iommu.h | 2 --
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 8af6c96f31b..712810598a2 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3106,15 +3106,16 @@ int intel_iommu_found(void)
 }
 EXPORT_SYMBOL_GPL(intel_iommu_found);
 
-u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
+static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
+					    unsigned long iova)
 {
+	struct dmar_domain *dmar_domain = domain->priv;
 	struct dma_pte *pte;
 	u64 phys = 0;
 
-	pte = addr_to_dma_pte(domain, iova);
+	pte = addr_to_dma_pte(dmar_domain, iova);
 	if (pte)
 		phys = dma_pte_addr(pte);
 
 	return phys;
 }
-EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 6bc26e03858..26ccc029456 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -330,8 +330,6 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova);
-
 #ifdef CONFIG_DMAR
 int intel_iommu_found(void);
 #else /* CONFIG_DMAR */
-- 
cgit v1.2.3-70-g09d2


From e4754c96cf8b82a754dc5ba791d6c0bf1fbe8e8e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 3 Dec 2008 15:26:42 +0100
Subject: VT-d: remove now unused intel_iommu_found function

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c   | 6 ------
 include/linux/intel-iommu.h | 9 ---------
 2 files changed, 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 81e04ec85d9..ecb5fd3b71f 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3105,12 +3105,6 @@ static void intel_iommu_unmap_range(struct iommu_domain *domain,
 		dmar_domain->max_addr = base;
 }
 
-int intel_iommu_found(void)
-{
-	return g_num_of_iommus;
-}
-EXPORT_SYMBOL_GPL(intel_iommu_found);
-
 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
 					    unsigned long iova)
 {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 26ccc029456..c4f6c101dbc 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -330,15 +330,6 @@ extern int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
 
-#ifdef CONFIG_DMAR
-int intel_iommu_found(void);
-#else /* CONFIG_DMAR */
-static inline int intel_iommu_found(void)
-{
-	return 0;
-}
-#endif /* CONFIG_DMAR */
-
 extern void *intel_alloc_coherent(struct device *, size_t, dma_addr_t *, gfp_t);
 extern void intel_free_coherent(struct device *, size_t, void *, dma_addr_t);
 extern dma_addr_t intel_map_single(struct device *, phys_addr_t, size_t, int);
-- 
cgit v1.2.3-70-g09d2


From 2f983570010a0dcb26d988da02d7ccfad00c807c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 3 Jan 2009 00:06:34 -0800
Subject: sparseirq: move set/get_timer_rand_state back to .c

those two functions only used in that C file

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/random.c  | 40 ++++++++++++++++++++++++++++++++++------
 include/linux/random.h | 50 --------------------------------------------------
 2 files changed, 34 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index d26891bfcd4..c7afc068c28 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -559,7 +559,40 @@ struct timer_rand_state {
 };
 
 #ifndef CONFIG_SPARSE_IRQ
-struct timer_rand_state *irq_timer_state[NR_IRQS];
+
+static struct timer_rand_state *irq_timer_state[NR_IRQS];
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	return irq_timer_state[irq];
+}
+
+static void set_timer_rand_state(unsigned int irq,
+				 struct timer_rand_state *state)
+{
+	irq_timer_state[irq] = state;
+}
+
+#else
+
+static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	return desc->timer_rand_state;
+}
+
+static void set_timer_rand_state(unsigned int irq,
+				 struct timer_rand_state *state)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	desc->timer_rand_state = state;
+}
 #endif
 
 static struct timer_rand_state input_timer_state;
@@ -919,11 +952,6 @@ void rand_initialize_irq(int irq)
 {
 	struct timer_rand_state *state;
 
-#ifndef CONFIG_SPARSE_IRQ
-	if (irq >= nr_irqs)
-		return;
-#endif
-
 	state = get_timer_rand_state(irq);
 
 	if (state)
diff --git a/include/linux/random.h b/include/linux/random.h
index adbf3bd3c6b..407ea3646f8 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -45,56 +45,6 @@ struct rand_pool_info {
 
 extern void rand_initialize_irq(int irq);
 
-struct timer_rand_state;
-#ifndef CONFIG_SPARSE_IRQ
-
-extern struct timer_rand_state *irq_timer_state[];
-
-static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
-{
-	if (irq >= nr_irqs)
-		return NULL;
-
-	return irq_timer_state[irq];
-}
-
-static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
-{
-	if (irq >= nr_irqs)
-		return;
-
-	irq_timer_state[irq] = state;
-}
-
-#else
-
-#include <linux/irq.h>
-static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	if (!desc)
-		return NULL;
-
-	return desc->timer_rand_state;
-}
-
-static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	if (!desc)
-		return;
-
-	desc->timer_rand_state = state;
-}
-#endif
-
-
 extern void add_input_randomness(unsigned int type, unsigned int code,
 				 unsigned int value);
 extern void add_interrupt_randomness(int irq);
-- 
cgit v1.2.3-70-g09d2


From f3298dc4f2277874d40cb4fc3a6e277317d6603b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 03:16:51 -0500
Subject: sanitize audit_socketcall

* don't bother with allocations
* now that it can't fail, make it return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  4 ++--
 kernel/auditsc.c      | 66 +++++++++++++++++++++++++++++----------------------
 net/socket.c          |  4 +---
 3 files changed, 41 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 26c4f6f65a4..466a953d4bf 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -446,7 +446,7 @@ extern void audit_log_task_context(struct audit_buffer *ab);
 extern int __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
-extern int audit_socketcall(int nargs, unsigned long *args);
+extern void audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int __audit_fd_pair(int fd1, int fd2);
 extern int audit_set_macxattr(const char *name);
@@ -549,7 +549,7 @@ extern int audit_signals;
 #define audit_ipc_obj(i) ({ 0; })
 #define audit_ipc_set_perm(q,u,g,m) ({ 0; })
 #define audit_bprm(p) ({ 0; })
-#define audit_socketcall(n,a) ({ 0; })
+#define audit_socketcall(n,a) ((void)0)
 #define audit_fd_pair(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_set_macxattr(n) do { ; } while (0)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c2e43ebb1b6..5cda66466e1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,6 @@ struct audit_aux_data_execve {
 	struct mm_struct *mm;
 };
 
-struct audit_aux_data_socketcall {
-	struct audit_aux_data	d;
-	int			nargs;
-	unsigned long		args[0];
-};
-
 struct audit_aux_data_fd_pair {
 	struct	audit_aux_data d;
 	int	fd[2];
@@ -247,6 +241,14 @@ struct audit_context {
 	struct audit_tree_refs *trees, *first_trees;
 	int tree_count;
 
+	int type;
+	union {
+		struct {
+			int nargs;
+			long args[6];
+		} socketcall;
+	};
+
 #if AUDIT_DEBUG
 	int		    put_count;
 	int		    ino_count;
@@ -1226,6 +1228,27 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 		audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
 }
 
+static void show_special(struct audit_context *context)
+{
+	struct audit_buffer *ab;
+	int i;
+
+	ab = audit_log_start(context, GFP_KERNEL, context->type);
+	if (!ab)
+		return;
+
+	switch (context->type) {
+	case AUDIT_SOCKETCALL: {
+		int nargs = context->socketcall.nargs;
+		audit_log_format(ab, "nargs=%d", nargs);
+		for (i = 0; i < nargs; i++)
+			audit_log_format(ab, " a%d=%lx", i,
+				context->socketcall.args[i]);
+		break; }
+	}
+	audit_log_end(ab);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
 	const struct cred *cred;
@@ -1372,13 +1395,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_execve_info(context, &ab, axi);
 			break; }
 
-		case AUDIT_SOCKETCALL: {
-			struct audit_aux_data_socketcall *axs = (void *)aux;
-			audit_log_format(ab, "nargs=%d", axs->nargs);
-			for (i=0; i<axs->nargs; i++)
-				audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
-			break; }
-
 		case AUDIT_FD_PAIR: {
 			struct audit_aux_data_fd_pair *axs = (void *)aux;
 			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
@@ -1410,6 +1426,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_end(ab);
 	}
 
+	if (context->type)
+		show_special(context);
+
 	if (context->sockaddr_len) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
 		if (ab) {
@@ -1689,6 +1708,7 @@ void audit_syscall_exit(int valid, long return_code)
 		context->target_pid = 0;
 		context->target_sid = 0;
 		context->sockaddr_len = 0;
+		context->type = 0;
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 		tsk->audit_context = context;
@@ -2406,27 +2426,17 @@ int audit_bprm(struct linux_binprm *bprm)
  * @nargs: number of args
  * @args: args array
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_socketcall(int nargs, unsigned long *args)
+void audit_socketcall(int nargs, unsigned long *args)
 {
-	struct audit_aux_data_socketcall *ax;
 	struct audit_context *context = current->audit_context;
 
 	if (likely(!context || context->dummy))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->nargs = nargs;
-	memcpy(ax->args, args, nargs * sizeof(unsigned long));
+		return;
 
-	ax->d.type = AUDIT_SOCKETCALL;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->type = AUDIT_SOCKETCALL;
+	context->socketcall.nargs = nargs;
+	memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
 }
 
 /**
diff --git a/net/socket.c b/net/socket.c
index 2c730fc718a..b41a92093e4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2065,9 +2065,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	if (copy_from_user(a, args, nargs[call]))
 		return -EFAULT;
 
-	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
-	if (err)
-		return err;
+	audit_socketcall(nargs[call] / sizeof(unsigned long), a);
 
 	a0 = a[0];
 	a1 = a[1];
-- 
cgit v1.2.3-70-g09d2


From a33e6751003c5ade603737d828b1519d980ce392 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 03:40:06 -0500
Subject: sanitize audit_ipc_obj()

* get rid of allocations
* make it return void
* simplify callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  9 +++---
 ipc/shm.c             |  4 +--
 ipc/util.c            |  9 ++----
 kernel/auditsc.c      | 88 ++++++++++++++++++++++-----------------------------
 4 files changed, 45 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 466a953d4bf..f8578b9088e 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -443,7 +443,7 @@ extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 #define audit_get_loginuid(t) ((t)->loginuid)
 #define audit_get_sessionid(t) ((t)->sessionid)
 extern void audit_log_task_context(struct audit_buffer *ab);
-extern int __audit_ipc_obj(struct kern_ipc_perm *ipcp);
+extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
 extern void audit_socketcall(int nargs, unsigned long *args);
@@ -460,11 +460,10 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				  const struct cred *old);
 extern int __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old);
 
-static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp)
+static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_ipc_obj(ipcp);
-	return 0;
+		__audit_ipc_obj(ipcp);
 }
 static inline int audit_fd_pair(int fd1, int fd2)
 {
@@ -546,7 +545,7 @@ extern int audit_signals;
 #define audit_get_loginuid(t) (-1)
 #define audit_get_sessionid(t) (-1)
 #define audit_log_task_context(b) do { ; } while (0)
-#define audit_ipc_obj(i) ({ 0; })
+#define audit_ipc_obj(i) ((void)0)
 #define audit_ipc_set_perm(q,u,g,m) ({ 0; })
 #define audit_bprm(p) ({ 0; })
 #define audit_socketcall(n,a) ((void)0)
diff --git a/ipc/shm.c b/ipc/shm.c
index 38a055758a9..57dd50046ce 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -747,9 +747,7 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
 			goto out;
 		}
 
-		err = audit_ipc_obj(&(shp->shm_perm));
-		if (err)
-			goto out_unlock;
+		audit_ipc_obj(&(shp->shm_perm));
 
 		if (!capable(CAP_IPC_LOCK)) {
 			uid_t euid = current_euid();
diff --git a/ipc/util.c b/ipc/util.c
index 5a1808c774a..579552abd50 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -624,10 +624,9 @@ void ipc_rcu_putref(void *ptr)
 int ipcperms (struct kern_ipc_perm *ipcp, short flag)
 {	/* flag will most probably be 0 or S_...UGO from <linux/stat.h> */
 	uid_t euid = current_euid();
-	int requested_mode, granted_mode, err;
+	int requested_mode, granted_mode;
 
-	if (unlikely((err = audit_ipc_obj(ipcp))))
-		return err;
+	audit_ipc_obj(ipcp);
 	requested_mode = (flag >> 6) | (flag >> 3) | flag;
 	granted_mode = ipcp->mode;
 	if (euid == ipcp->cuid ||
@@ -803,9 +802,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
 		goto out_up;
 	}
 
-	err = audit_ipc_obj(ipcp);
-	if (err)
-		goto out_unlock;
+	audit_ipc_obj(ipcp);
 
 	if (cmd == IPC_SET) {
 		err = audit_ipc_set_perm(extra_perm, perm->uid,
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 5cda66466e1..73504313264 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -247,6 +247,12 @@ struct audit_context {
 			int nargs;
 			long args[6];
 		} socketcall;
+		struct {
+			uid_t			uid;
+			gid_t			gid;
+			mode_t			mode;
+			u32			osid;
+		} ipc;
 	};
 
 #if AUDIT_DEBUG
@@ -605,19 +611,12 @@ static int audit_filter_rules(struct task_struct *tsk,
 					}
 				}
 				/* Find ipc objects that match */
-				if (ctx) {
-					struct audit_aux_data *aux;
-					for (aux = ctx->aux; aux;
-					     aux = aux->next) {
-						if (aux->type == AUDIT_IPC) {
-							struct audit_aux_data_ipcctl *axi = (void *)aux;
-							if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
-								++result;
-								break;
-							}
-						}
-					}
-				}
+				if (!ctx || ctx->type != AUDIT_IPC)
+					break;
+				if (security_audit_rule_match(ctx->ipc.osid,
+							      f->type, f->op,
+							      f->lsm_rule, ctx))
+					++result;
 			}
 			break;
 		case AUDIT_ARG0:
@@ -1228,7 +1227,7 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 		audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
 }
 
-static void show_special(struct audit_context *context)
+static void show_special(struct audit_context *context, int *call_panic)
 {
 	struct audit_buffer *ab;
 	int i;
@@ -1245,6 +1244,23 @@ static void show_special(struct audit_context *context)
 			audit_log_format(ab, " a%d=%lx", i,
 				context->socketcall.args[i]);
 		break; }
+	case AUDIT_IPC: {
+		u32 osid = context->ipc.osid;
+
+		audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
+			 context->ipc.uid, context->ipc.gid, context->ipc.mode);
+		if (osid) {
+			char *ctx = NULL;
+			u32 len;
+			if (security_secid_to_secctx(osid, &ctx, &len)) {
+				audit_log_format(ab, " osid=%u", osid);
+				*call_panic = 1;
+			} else {
+				audit_log_format(ab, " obj=%s", ctx);
+				security_release_secctx(ctx, len);
+			}
+		}
+		break; }
 	}
 	audit_log_end(ab);
 }
@@ -1363,26 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
 			break; }
 
-		case AUDIT_IPC: {
-			struct audit_aux_data_ipcctl *axi = (void *)aux;
-			audit_log_format(ab, 
-				 "ouid=%u ogid=%u mode=%#o",
-				 axi->uid, axi->gid, axi->mode);
-			if (axi->osid != 0) {
-				char *ctx = NULL;
-				u32 len;
-				if (security_secid_to_secctx(
-						axi->osid, &ctx, &len)) {
-					audit_log_format(ab, " osid=%u",
-							axi->osid);
-					call_panic = 1;
-				} else {
-					audit_log_format(ab, " obj=%s", ctx);
-					security_release_secctx(ctx, len);
-				}
-			}
-			break; }
-
 		case AUDIT_IPC_SET_PERM: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab,
@@ -1427,7 +1423,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	}
 
 	if (context->type)
-		show_special(context);
+		show_special(context, &call_panic);
 
 	if (context->sockaddr_len) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
@@ -2349,25 +2345,15 @@ int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
  * audit_ipc_obj - record audit data for ipc object
  * @ipcp: ipc permissions
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
+void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
-	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->uid = ipcp->uid;
-	ax->gid = ipcp->gid;
-	ax->mode = ipcp->mode;
-	security_ipc_getsecid(ipcp, &ax->osid);
-	ax->d.type = AUDIT_IPC;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->ipc.uid = ipcp->uid;
+	context->ipc.gid = ipcp->gid;
+	context->ipc.mode = ipcp->mode;
+	security_ipc_getsecid(ipcp, &context->ipc.osid);
+	context->type = AUDIT_IPC;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From e816f370cbadd2afea9f1a42f232d0636137d563 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 03:47:15 -0500
Subject: sanitize audit_ipc_set_perm()

* get rid of allocations
* make it return void
* simplify callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  9 ++++----
 ipc/util.c            |  9 ++------
 kernel/auditsc.c      | 59 +++++++++++++++++++++++----------------------------
 3 files changed, 32 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index f8578b9088e..b7abfe0d673 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -444,7 +444,7 @@ extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 #define audit_get_sessionid(t) ((t)->sessionid)
 extern void audit_log_task_context(struct audit_buffer *ab);
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
-extern int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
+extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
 extern int audit_bprm(struct linux_binprm *bprm);
 extern void audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
@@ -471,11 +471,10 @@ static inline int audit_fd_pair(int fd1, int fd2)
 		return __audit_fd_pair(fd1, fd2);
 	return 0;
 }
-static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_ipc_set_perm(qbytes, uid, gid, mode);
-	return 0;
+		__audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
 static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
 {
@@ -546,7 +545,7 @@ extern int audit_signals;
 #define audit_get_sessionid(t) (-1)
 #define audit_log_task_context(b) do { ; } while (0)
 #define audit_ipc_obj(i) ((void)0)
-#define audit_ipc_set_perm(q,u,g,m) ({ 0; })
+#define audit_ipc_set_perm(q,u,g,m) ((void)0)
 #define audit_bprm(p) ({ 0; })
 #define audit_socketcall(n,a) ((void)0)
 #define audit_fd_pair(n,a) ({ 0; })
diff --git a/ipc/util.c b/ipc/util.c
index 579552abd50..7585a72e259 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -803,13 +803,9 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
 	}
 
 	audit_ipc_obj(ipcp);
-
-	if (cmd == IPC_SET) {
-		err = audit_ipc_set_perm(extra_perm, perm->uid,
+	if (cmd == IPC_SET)
+		audit_ipc_set_perm(extra_perm, perm->uid,
 					 perm->gid, perm->mode);
-		if (err)
-			goto out_unlock;
-	}
 
 	euid = current_euid();
 	if (euid == ipcp->cuid ||
@@ -817,7 +813,6 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
 		return ipcp;
 
 	err = -EPERM;
-out_unlock:
 	ipc_unlock(ipcp);
 out_up:
 	up_write(&ids->rw_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 73504313264..fbed62e05bc 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -151,16 +151,6 @@ struct audit_aux_data_mq_getsetattr {
 	struct mq_attr 		mqstat;
 };
 
-struct audit_aux_data_ipcctl {
-	struct audit_aux_data	d;
-	struct ipc_perm		p;
-	unsigned long		qbytes;
-	uid_t			uid;
-	gid_t			gid;
-	mode_t			mode;
-	u32			osid;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -252,6 +242,11 @@ struct audit_context {
 			gid_t			gid;
 			mode_t			mode;
 			u32			osid;
+			int			has_perm;
+			uid_t			perm_uid;
+			gid_t			perm_gid;
+			mode_t			perm_mode;
+			unsigned long		qbytes;
 		} ipc;
 	};
 
@@ -1260,6 +1255,19 @@ static void show_special(struct audit_context *context, int *call_panic)
 				security_release_secctx(ctx, len);
 			}
 		}
+		if (context->ipc.has_perm) {
+			audit_log_end(ab);
+			ab = audit_log_start(context, GFP_KERNEL,
+					     AUDIT_IPC_SET_PERM);
+			audit_log_format(ab,
+				"qbytes=%lx ouid=%u ogid=%u mode=%#o",
+				context->ipc.qbytes,
+				context->ipc.perm_uid,
+				context->ipc.perm_gid,
+				context->ipc.perm_mode);
+			if (!ab)
+				return;
+		}
 		break; }
 	}
 	audit_log_end(ab);
@@ -1379,13 +1387,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
 			break; }
 
-		case AUDIT_IPC_SET_PERM: {
-			struct audit_aux_data_ipcctl *axi = (void *)aux;
-			audit_log_format(ab,
-				"qbytes=%lx ouid=%u ogid=%u mode=%#o",
-				axi->qbytes, axi->uid, axi->gid, axi->mode);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2352,6 +2353,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 	context->ipc.uid = ipcp->uid;
 	context->ipc.gid = ipcp->gid;
 	context->ipc.mode = ipcp->mode;
+	context->ipc.has_perm = 0;
 	security_ipc_getsecid(ipcp, &context->ipc.osid);
 	context->type = AUDIT_IPC;
 }
@@ -2363,26 +2365,17 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
  * @gid: msgq group id
  * @mode: msgq mode (permissions)
  *
- * Returns 0 for success or NULL context or < 0 on error.
+ * Called only after audit_ipc_obj().
  */
-int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
-	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
 
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->qbytes = qbytes;
-	ax->uid = uid;
-	ax->gid = gid;
-	ax->mode = mode;
-
-	ax->d.type = AUDIT_IPC_SET_PERM;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->ipc.qbytes = qbytes;
+	context->ipc.perm_uid = uid;
+	context->ipc.perm_gid = gid;
+	context->ipc.perm_mode = mode;
+	context->ipc.has_perm = 1;
 }
 
 int audit_bprm(struct linux_binprm *bprm)
-- 
cgit v1.2.3-70-g09d2


From 7392906ea915b9a2c14dea32b3604b4e178f82f7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 06:58:59 -0500
Subject: sanitize audit_mq_getsetattr()

* get rid of allocations
* make it return void
* don't duplicate parts of audit_dummy_context()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  9 ++++-----
 ipc/mqueue.c          |  6 +-----
 kernel/auditsc.c      | 54 ++++++++++++++++-----------------------------------
 3 files changed, 22 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b7abfe0d673..b7707e577b8 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -454,7 +454,7 @@ extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr
 extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout);
 extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
 extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
-extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
+extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
 extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				  const struct cred *new,
 				  const struct cred *old);
@@ -500,11 +500,10 @@ static inline int audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_n
 		return __audit_mq_notify(mqdes, u_notification);
 	return 0;
 }
-static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_mq_getsetattr(mqdes, mqstat);
-	return 0;
+		__audit_mq_getsetattr(mqdes, mqstat);
 }
 
 static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
@@ -555,7 +554,7 @@ extern int audit_signals;
 #define audit_mq_timedsend(d,l,p,t) ({ 0; })
 #define audit_mq_timedreceive(d,l,p,t) ({ 0; })
 #define audit_mq_notify(d,n) ({ 0; })
-#define audit_mq_getsetattr(d,s) ({ 0; })
+#define audit_mq_getsetattr(d,s) ((void)0)
 #define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
 #define audit_log_capset(pid, ncr, ocr) ({ 0; })
 #define audit_ptrace(t) ((void)0)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index d9393f8e4c3..7563611c661 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1150,11 +1150,7 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
 	omqstat = info->attr;
 	omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
 	if (u_mqstat) {
-		ret = audit_mq_getsetattr(mqdes, &mqstat);
-		if (ret != 0) {
-			spin_unlock(&info->lock);
-			goto out_fput;
-		}
+		audit_mq_getsetattr(mqdes, &mqstat);
 		if (mqstat.mq_flags & O_NONBLOCK)
 			filp->f_flags |= O_NONBLOCK;
 		else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fbed62e05bc..c50178c7e24 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -145,12 +145,6 @@ struct audit_aux_data_mq_notify {
 	struct sigevent 	notification;
 };
 
-struct audit_aux_data_mq_getsetattr {
-	struct audit_aux_data	d;
-	mqd_t			mqdes;
-	struct mq_attr 		mqstat;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -248,6 +242,10 @@ struct audit_context {
 			mode_t			perm_mode;
 			unsigned long		qbytes;
 		} ipc;
+		struct {
+			mqd_t			mqdes;
+			struct mq_attr 		mqstat;
+		} mq_getsetattr;
 	};
 
 #if AUDIT_DEBUG
@@ -1269,6 +1267,15 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_GETSETATTR: {
+		struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+		audit_log_format(ab,
+			"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
+			"mq_curmsgs=%ld ",
+			context->mq_getsetattr.mqdes,
+			attr->mq_flags, attr->mq_maxmsg,
+			attr->mq_msgsize, attr->mq_curmsgs);
+		break; }
 	}
 	audit_log_end(ab);
 }
@@ -1377,16 +1384,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->notification.sigev_signo);
 			break; }
 
-		case AUDIT_MQ_GETSETATTR: {
-			struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
-			audit_log_format(ab,
-				"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
-				"mq_curmsgs=%ld ",
-				axi->mqdes,
-				axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
-				axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2316,30 +2313,13 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
  * @mqdes: MQ descriptor
  * @mqstat: MQ flags
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 {
-	struct audit_aux_data_mq_getsetattr *ax;
 	struct audit_context *context = current->audit_context;
-
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->mqdes = mqdes;
-	ax->mqstat = *mqstat;
-
-	ax->d.type = AUDIT_MQ_GETSETATTR;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->mq_getsetattr.mqdes = mqdes;
+	context->mq_getsetattr.mqstat = *mqstat;
+	context->type = AUDIT_MQ_GETSETATTR;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 20114f71b27cafeb7c7e41d2b0f0b68c3fbb022b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 10 Dec 2008 07:16:12 -0500
Subject: sanitize audit_mq_notify()

* don't copy_from_user() twice
* don't bother with allocations
* don't duplicate parts of audit_dummy_context()
* make it return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  9 ++++-----
 ipc/mqueue.c          | 14 ++++++-------
 kernel/auditsc.c      | 56 +++++++++++++++------------------------------------
 3 files changed, 27 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b7707e577b8..8101d2c4a99 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -453,7 +453,7 @@ extern int audit_set_macxattr(const char *name);
 extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr);
 extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout);
 extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
-extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification);
+extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
 extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
 extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				  const struct cred *new,
@@ -494,11 +494,10 @@ static inline int audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned in
 		return __audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout);
 	return 0;
 }
-static inline int audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_mq_notify(mqdes, u_notification);
-	return 0;
+		__audit_mq_notify(mqdes, notification);
 }
 static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 {
@@ -553,7 +552,7 @@ extern int audit_signals;
 #define audit_mq_open(o,m,a) ({ 0; })
 #define audit_mq_timedsend(d,l,p,t) ({ 0; })
 #define audit_mq_timedreceive(d,l,p,t) ({ 0; })
-#define audit_mq_notify(d,n) ({ 0; })
+#define audit_mq_notify(d,n) ((void)0)
 #define audit_mq_getsetattr(d,s) ((void)0)
 #define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
 #define audit_log_capset(pid, ncr, ocr) ({ 0; })
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 7563611c661..e7b2f68f8d7 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1003,17 +1003,17 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 	struct mqueue_inode_info *info;
 	struct sk_buff *nc;
 
-	ret = audit_mq_notify(mqdes, u_notification);
-	if (ret != 0)
-		return ret;
-
-	nc = NULL;
-	sock = NULL;
-	if (u_notification != NULL) {
+	if (u_notification) {
 		if (copy_from_user(&notification, u_notification,
 					sizeof(struct sigevent)))
 			return -EFAULT;
+	}
+
+	audit_mq_notify(mqdes, u_notification ? &notification : NULL);
 
+	nc = NULL;
+	sock = NULL;
+	if (u_notification != NULL) {
 		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
 			     notification.sigev_notify != SIGEV_SIGNAL &&
 			     notification.sigev_notify != SIGEV_THREAD))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c50178c7e24..3ece960de89 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -139,12 +139,6 @@ struct audit_aux_data_mq_sendrecv {
 	struct timespec		abs_timeout;
 };
 
-struct audit_aux_data_mq_notify {
-	struct audit_aux_data	d;
-	mqd_t			mqdes;
-	struct sigevent 	notification;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -246,6 +240,10 @@ struct audit_context {
 			mqd_t			mqdes;
 			struct mq_attr 		mqstat;
 		} mq_getsetattr;
+		struct {
+			mqd_t			mqdes;
+			int			sigev_signo;
+		} mq_notify;
 	};
 
 #if AUDIT_DEBUG
@@ -1267,6 +1265,11 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_NOTIFY: {
+		audit_log_format(ab, "mqdes=%d sigev_signo=%d",
+				context->mq_notify.mqdes,
+				context->mq_notify.sigev_signo);
+		break; }
 	case AUDIT_MQ_GETSETATTR: {
 		struct mq_attr *attr = &context->mq_getsetattr.mqstat;
 		audit_log_format(ab,
@@ -1376,14 +1379,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
 			break; }
 
-		case AUDIT_MQ_NOTIFY: {
-			struct audit_aux_data_mq_notify *axi = (void *)aux;
-			audit_log_format(ab,
-				"mqdes=%d sigev_signo=%d",
-				axi->mqdes,
-				axi->notification.sigev_signo);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2274,38 +2269,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
  * @mqdes: MQ descriptor
  * @u_notification: Notification event
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
 
-int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 {
-	struct audit_aux_data_mq_notify *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_notification != NULL) {
-		if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->notification, 0, sizeof(ax->notification));
-
-	ax->mqdes = mqdes;
+	if (notification)
+		context->mq_notify.sigev_signo = notification->sigev_signo;
+	else
+		context->mq_notify.sigev_signo = 0;
 
-	ax->d.type = AUDIT_MQ_NOTIFY;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->mq_notify.mqdes = mqdes;
+	context->type = AUDIT_MQ_NOTIFY;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From c32c8af43b9adde8d6f938d8e6328c13b8de79ac Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 03:46:48 -0500
Subject: sanitize AUDIT_MQ_SENDRECV

* logging the original value of *msg_prio in mq_timedreceive(2)
  is insane - the argument is write-only (i.e. syscall always
  ignores the original value and only overwrites it).
* merge __audit_mq_timed{send,receive}
* don't do copy_from_user() twice
* don't mess with allocations in auditsc part
* ... and don't bother checking !audit_enabled and !context in there -
  we'd already checked for audit_dummy_context().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  17 ++-----
 ipc/mqueue.c          |  54 +++++++++++----------
 kernel/auditsc.c      | 127 ++++++++++++--------------------------------------
 3 files changed, 63 insertions(+), 135 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8101d2c4a99..67f0cdd991b 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -451,8 +451,7 @@ extern int audit_sockaddr(int len, void *addr);
 extern int __audit_fd_pair(int fd1, int fd2);
 extern int audit_set_macxattr(const char *name);
 extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr);
-extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout);
-extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout);
+extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
 extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
 extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
 extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
@@ -482,17 +481,10 @@ static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u
 		return __audit_mq_open(oflag, mode, u_attr);
 	return 0;
 }
-static inline int audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout)
+static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout);
-	return 0;
-}
-static inline int audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout)
-{
-	if (unlikely(!audit_dummy_context()))
-		return __audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout);
-	return 0;
+		__audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
 }
 static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 {
@@ -550,8 +542,7 @@ extern int audit_signals;
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_set_macxattr(n) do { ; } while (0)
 #define audit_mq_open(o,m,a) ({ 0; })
-#define audit_mq_timedsend(d,l,p,t) ({ 0; })
-#define audit_mq_timedreceive(d,l,p,t) ({ 0; })
+#define audit_mq_sendrecv(d,l,p,t) ((void)0)
 #define audit_mq_notify(d,n) ((void)0)
 #define audit_mq_getsetattr(d,s) ((void)0)
 #define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index e7b2f68f8d7..192da806c28 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -524,31 +524,27 @@ static void __do_notify(struct mqueue_inode_info *info)
 	wake_up(&info->wait_q);
 }
 
-static long prepare_timeout(const struct timespec __user *u_arg)
+static long prepare_timeout(struct timespec *p)
 {
-	struct timespec ts, nowts;
+	struct timespec nowts;
 	long timeout;
 
-	if (u_arg) {
-		if (unlikely(copy_from_user(&ts, u_arg,
-					sizeof(struct timespec))))
-			return -EFAULT;
-
-		if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0
-			|| ts.tv_nsec >= NSEC_PER_SEC))
+	if (p) {
+		if (unlikely(p->tv_nsec < 0 || p->tv_sec < 0
+			|| p->tv_nsec >= NSEC_PER_SEC))
 			return -EINVAL;
 		nowts = CURRENT_TIME;
 		/* first subtract as jiffies can't be too big */
-		ts.tv_sec -= nowts.tv_sec;
-		if (ts.tv_nsec < nowts.tv_nsec) {
-			ts.tv_nsec += NSEC_PER_SEC;
-			ts.tv_sec--;
+		p->tv_sec -= nowts.tv_sec;
+		if (p->tv_nsec < nowts.tv_nsec) {
+			p->tv_nsec += NSEC_PER_SEC;
+			p->tv_sec--;
 		}
-		ts.tv_nsec -= nowts.tv_nsec;
-		if (ts.tv_sec < 0)
+		p->tv_nsec -= nowts.tv_nsec;
+		if (p->tv_sec < 0)
 			return 0;
 
-		timeout = timespec_to_jiffies(&ts) + 1;
+		timeout = timespec_to_jiffies(p) + 1;
 	} else
 		return MAX_SCHEDULE_TIMEOUT;
 
@@ -829,17 +825,22 @@ asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 	struct ext_wait_queue *receiver;
 	struct msg_msg *msg_ptr;
 	struct mqueue_inode_info *info;
+	struct timespec ts, *p = NULL;
 	long timeout;
 	int ret;
 
-	ret = audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout);
-	if (ret != 0)
-		return ret;
+	if (u_abs_timeout) {
+		if (copy_from_user(&ts, u_abs_timeout, 
+					sizeof(struct timespec)))
+			return -EFAULT;
+		p = &ts;
+	}
 
 	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
 		return -EINVAL;
 
-	timeout = prepare_timeout(u_abs_timeout);
+	audit_mq_sendrecv(mqdes, msg_len, msg_prio, p);
+	timeout = prepare_timeout(p);
 
 	ret = -EBADF;
 	filp = fget(mqdes);
@@ -918,12 +919,17 @@ asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 	struct inode *inode;
 	struct mqueue_inode_info *info;
 	struct ext_wait_queue wait;
+	struct timespec ts, *p = NULL;
 
-	ret = audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout);
-	if (ret != 0)
-		return ret;
+	if (u_abs_timeout) {
+		if (copy_from_user(&ts, u_abs_timeout, 
+					sizeof(struct timespec)))
+			return -EFAULT;
+		p = &ts;
+	}
 
-	timeout = prepare_timeout(u_abs_timeout);
+	audit_mq_sendrecv(mqdes, msg_len, 0, p);
+	timeout = prepare_timeout(p);
 
 	ret = -EBADF;
 	filp = fget(mqdes);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3ece960de89..140c4745347 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -131,14 +131,6 @@ struct audit_aux_data_mq_open {
 	struct mq_attr		attr;
 };
 
-struct audit_aux_data_mq_sendrecv {
-	struct audit_aux_data	d;
-	mqd_t			mqdes;
-	size_t			msg_len;
-	unsigned int		msg_prio;
-	struct timespec		abs_timeout;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -244,6 +236,12 @@ struct audit_context {
 			mqd_t			mqdes;
 			int			sigev_signo;
 		} mq_notify;
+		struct {
+			mqd_t			mqdes;
+			size_t			msg_len;
+			unsigned int		msg_prio;
+			struct timespec		abs_timeout;
+		} mq_sendrecv;
 	};
 
 #if AUDIT_DEBUG
@@ -1265,6 +1263,16 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_SENDRECV: {
+		audit_log_format(ab,
+			"mqdes=%d msg_len=%zd msg_prio=%u "
+			"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+			context->mq_sendrecv.mqdes,
+			context->mq_sendrecv.msg_len,
+			context->mq_sendrecv.msg_prio,
+			context->mq_sendrecv.abs_timeout.tv_sec,
+			context->mq_sendrecv.abs_timeout.tv_nsec);
+		break; }
 	case AUDIT_MQ_NOTIFY: {
 		audit_log_format(ab, "mqdes=%d sigev_signo=%d",
 				context->mq_notify.mqdes,
@@ -1370,15 +1378,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				axi->attr.mq_curmsgs);
 			break; }
 
-		case AUDIT_MQ_SENDRECV: {
-			struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
-			audit_log_format(ab,
-				"mqdes=%d msg_len=%zd msg_prio=%u "
-				"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
-				axi->mqdes, axi->msg_len, axi->msg_prio,
-				axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
-			break; }
-
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
 			audit_log_execve_info(context, &ab, axi);
@@ -2171,97 +2170,29 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
 }
 
 /**
- * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
+ * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
  * @mqdes: MQ descriptor
  * @msg_len: Message length
  * @msg_prio: Message priority
- * @u_abs_timeout: Message timeout in absolute time
- *
- * Returns 0 for success or NULL context or < 0 on error.
- */
-int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
-			const struct timespec __user *u_abs_timeout)
-{
-	struct audit_aux_data_mq_sendrecv *ax;
-	struct audit_context *context = current->audit_context;
-
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_abs_timeout != NULL) {
-		if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
-
-	ax->mqdes = mqdes;
-	ax->msg_len = msg_len;
-	ax->msg_prio = msg_prio;
-
-	ax->d.type = AUDIT_MQ_SENDRECV;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
-}
-
-/**
- * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
- * @mqdes: MQ descriptor
- * @msg_len: Message length
- * @u_msg_prio: Message priority
- * @u_abs_timeout: Message timeout in absolute time
+ * @abs_timeout: Message timeout in absolute time
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
-				unsigned int __user *u_msg_prio,
-				const struct timespec __user *u_abs_timeout)
+void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+			const struct timespec *abs_timeout)
 {
-	struct audit_aux_data_mq_sendrecv *ax;
 	struct audit_context *context = current->audit_context;
+	struct timespec *p = &context->mq_sendrecv.abs_timeout;
 
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_msg_prio != NULL) {
-		if (get_user(ax->msg_prio, u_msg_prio)) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		ax->msg_prio = 0;
-
-	if (u_abs_timeout != NULL) {
-		if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+	if (abs_timeout)
+		memcpy(p, abs_timeout, sizeof(struct timespec));
+	else
+		memset(p, 0, sizeof(struct timespec));
 
-	ax->mqdes = mqdes;
-	ax->msg_len = msg_len;
+	context->mq_sendrecv.mqdes = mqdes;
+	context->mq_sendrecv.msg_len = msg_len;
+	context->mq_sendrecv.msg_prio = msg_prio;
 
-	ax->d.type = AUDIT_MQ_SENDRECV;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->type = AUDIT_MQ_SENDRECV;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 564f6993ffef656aebaf46cf2f1f6cb4f5c97207 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 04:02:26 -0500
Subject: sanitize audit_mq_open()

* don't bother with allocations
* don't do double copy_from_user()
* don't duplicate parts of check for audit_dummy_context()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  9 ++++---
 ipc/mqueue.c          | 23 +++++++++---------
 kernel/auditsc.c      | 65 ++++++++++++++++++---------------------------------
 3 files changed, 38 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 67f0cdd991b..54978bdd2bd 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -450,7 +450,7 @@ extern void audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int __audit_fd_pair(int fd1, int fd2);
 extern int audit_set_macxattr(const char *name);
-extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr);
+extern void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr);
 extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
 extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
 extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
@@ -475,11 +475,10 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
 	if (unlikely(!audit_dummy_context()))
 		__audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
-static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
+static inline void audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_mq_open(oflag, mode, u_attr);
-	return 0;
+		__audit_mq_open(oflag, mode, attr);
 }
 static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout)
 {
@@ -541,7 +540,7 @@ extern int audit_signals;
 #define audit_fd_pair(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_set_macxattr(n) do { ; } while (0)
-#define audit_mq_open(o,m,a) ({ 0; })
+#define audit_mq_open(o,m,a) ((void)0)
 #define audit_mq_sendrecv(d,l,p,t) ((void)0)
 #define audit_mq_notify(d,n) ((void)0)
 #define audit_mq_getsetattr(d,s) ((void)0)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 192da806c28..d448b69672b 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -588,22 +588,18 @@ static int mq_attr_ok(struct mq_attr *attr)
  * Invoked when creating a new queue via sys_mq_open
  */
 static struct file *do_create(struct dentry *dir, struct dentry *dentry,
-			int oflag, mode_t mode, struct mq_attr __user *u_attr)
+			int oflag, mode_t mode, struct mq_attr *attr)
 {
 	const struct cred *cred = current_cred();
-	struct mq_attr attr;
 	struct file *result;
 	int ret;
 
-	if (u_attr) {
-		ret = -EFAULT;
-		if (copy_from_user(&attr, u_attr, sizeof(attr)))
-			goto out;
+	if (attr) {
 		ret = -EINVAL;
-		if (!mq_attr_ok(&attr))
+		if (!mq_attr_ok(attr))
 			goto out;
 		/* store for use during create */
-		dentry->d_fsdata = &attr;
+		dentry->d_fsdata = attr;
 	}
 
 	mode &= ~current->fs->umask;
@@ -660,11 +656,13 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 	struct dentry *dentry;
 	struct file *filp;
 	char *name;
+	struct mq_attr attr;
 	int fd, error;
 
-	error = audit_mq_open(oflag, mode, u_attr);
-	if (error != 0)
-		return error;
+	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
+		return -EFAULT;
+
+	audit_mq_open(oflag, mode, u_attr ? &attr : NULL);
 
 	if (IS_ERR(name = getname(u_name)))
 		return PTR_ERR(name);
@@ -690,7 +688,8 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
 			filp = do_open(dentry, oflag);
 		} else {
 			filp = do_create(mqueue_mnt->mnt_root, dentry,
-						oflag, mode, u_attr);
+						oflag, mode,
+						u_attr ? &attr : NULL);
 		}
 	} else {
 		error = -ENOENT;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 140c4745347..83e946f1cdd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -124,13 +124,6 @@ struct audit_aux_data {
 /* Number of target pids per aux struct. */
 #define AUDIT_AUX_PIDS	16
 
-struct audit_aux_data_mq_open {
-	struct audit_aux_data	d;
-	int			oflag;
-	mode_t			mode;
-	struct mq_attr		attr;
-};
-
 struct audit_aux_data_execve {
 	struct audit_aux_data	d;
 	int argc;
@@ -242,6 +235,11 @@ struct audit_context {
 			unsigned int		msg_prio;
 			struct timespec		abs_timeout;
 		} mq_sendrecv;
+		struct {
+			int			oflag;
+			mode_t			mode;
+			struct mq_attr		attr;
+		} mq_open;
 	};
 
 #if AUDIT_DEBUG
@@ -1263,6 +1261,16 @@ static void show_special(struct audit_context *context, int *call_panic)
 				return;
 		}
 		break; }
+	case AUDIT_MQ_OPEN: {
+		audit_log_format(ab,
+			"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+			"mq_msgsize=%ld mq_curmsgs=%ld",
+			context->mq_open.oflag, context->mq_open.mode,
+			context->mq_open.attr.mq_flags,
+			context->mq_open.attr.mq_maxmsg,
+			context->mq_open.attr.mq_msgsize,
+			context->mq_open.attr.mq_curmsgs);
+		break; }
 	case AUDIT_MQ_SENDRECV: {
 		audit_log_format(ab,
 			"mqdes=%d msg_len=%zd msg_prio=%u "
@@ -1368,15 +1376,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			continue; /* audit_panic has been called */
 
 		switch (aux->type) {
-		case AUDIT_MQ_OPEN: {
-			struct audit_aux_data_mq_open *axi = (void *)aux;
-			audit_log_format(ab,
-				"oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
-				"mq_msgsize=%ld mq_curmsgs=%ld",
-				axi->oflag, axi->mode, axi->attr.mq_flags,
-				axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
-				axi->attr.mq_curmsgs);
-			break; }
 
 		case AUDIT_EXECVE: {
 			struct audit_aux_data_execve *axi = (void *)aux;
@@ -2135,38 +2134,20 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
  * @mode: mode bits
  * @u_attr: queue attributes
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
+void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
 {
-	struct audit_aux_data_mq_open *ax;
 	struct audit_context *context = current->audit_context;
 
-	if (!audit_enabled)
-		return 0;
-
-	if (likely(!context))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
-	if (!ax)
-		return -ENOMEM;
-
-	if (u_attr != NULL) {
-		if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
-			kfree(ax);
-			return -EFAULT;
-		}
-	} else
-		memset(&ax->attr, 0, sizeof(ax->attr));
+	if (attr)
+		memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
+	else
+		memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
 
-	ax->oflag = oflag;
-	ax->mode = mode;
+	context->mq_open.oflag = oflag;
+	context->mq_open.mode = mode;
 
-	ax->d.type = AUDIT_MQ_OPEN;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->type = AUDIT_MQ_OPEN;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From 157cf649a735a2f7e8dba0ed08e6e38b6c30d886 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 04:57:47 -0500
Subject: sanitize audit_fd_pair()

* no allocations
* return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c             |  7 +------
 include/linux/audit.h |  9 ++++-----
 kernel/auditsc.c      | 44 ++++++++++++++------------------------------
 net/socket.c          |  9 +--------
 4 files changed, 20 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b..891697112f6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
 		goto err_fdr;
 	fdw = error;
 
-	error = audit_fd_pair(fdr, fdw);
-	if (error < 0)
-		goto err_fdw;
-
+	audit_fd_pair(fdr, fdw);
 	fd_install(fdr, fr);
 	fd_install(fdw, fw);
 	fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
 
 	return 0;
 
- err_fdw:
-	put_unused_fd(fdw);
  err_fdr:
 	put_unused_fd(fdr);
  err_read_pipe:
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 54978bdd2bd..bd59cd1e321 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -448,7 +448,7 @@ extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mod
 extern int audit_bprm(struct linux_binprm *bprm);
 extern void audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
-extern int __audit_fd_pair(int fd1, int fd2);
+extern void __audit_fd_pair(int fd1, int fd2);
 extern int audit_set_macxattr(const char *name);
 extern void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr);
 extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
@@ -464,11 +464,10 @@ static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 	if (unlikely(!audit_dummy_context()))
 		__audit_ipc_obj(ipcp);
 }
-static inline int audit_fd_pair(int fd1, int fd2)
+static inline void audit_fd_pair(int fd1, int fd2)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_fd_pair(fd1, fd2);
-	return 0;
+		__audit_fd_pair(fd1, fd2);
 }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
@@ -537,7 +536,7 @@ extern int audit_signals;
 #define audit_ipc_set_perm(q,u,g,m) ((void)0)
 #define audit_bprm(p) ({ 0; })
 #define audit_socketcall(n,a) ((void)0)
-#define audit_fd_pair(n,a) ({ 0; })
+#define audit_fd_pair(n,a) ((void)0)
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_set_macxattr(n) do { ; } while (0)
 #define audit_mq_open(o,m,a) ((void)0)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 83e946f1cdd..327e65d5067 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -131,11 +131,6 @@ struct audit_aux_data_execve {
 	struct mm_struct *mm;
 };
 
-struct audit_aux_data_fd_pair {
-	struct	audit_aux_data d;
-	int	fd[2];
-};
-
 struct audit_aux_data_pids {
 	struct audit_aux_data	d;
 	pid_t			target_pid[AUDIT_AUX_PIDS];
@@ -241,6 +236,7 @@ struct audit_context {
 			struct mq_attr		attr;
 		} mq_open;
 	};
+	int fds[2];
 
 #if AUDIT_DEBUG
 	int		    put_count;
@@ -1382,11 +1378,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_execve_info(context, &ab, axi);
 			break; }
 
-		case AUDIT_FD_PAIR: {
-			struct audit_aux_data_fd_pair *axs = (void *)aux;
-			audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
-			break; }
-
 		case AUDIT_BPRM_FCAPS: {
 			struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
 			audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1416,6 +1407,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 	if (context->type)
 		show_special(context, &call_panic);
 
+	if (context->fds[0] >= 0) {
+		ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
+		if (ab) {
+			audit_log_format(ab, "fd0=%d fd1=%d",
+					context->fds[0], context->fds[1]);
+			audit_log_end(ab);
+		}
+	}
+
 	if (context->sockaddr_len) {
 		ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
 		if (ab) {
@@ -1696,6 +1696,7 @@ void audit_syscall_exit(int valid, long return_code)
 		context->target_sid = 0;
 		context->sockaddr_len = 0;
 		context->type = 0;
+		context->fds[0] = -1;
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 		tsk->audit_context = context;
@@ -2291,29 +2292,12 @@ void audit_socketcall(int nargs, unsigned long *args)
  * @fd1: the first file descriptor
  * @fd2: the second file descriptor
  *
- * Returns 0 for success or NULL context or < 0 on error.
  */
-int __audit_fd_pair(int fd1, int fd2)
+void __audit_fd_pair(int fd1, int fd2)
 {
 	struct audit_context *context = current->audit_context;
-	struct audit_aux_data_fd_pair *ax;
-
-	if (likely(!context)) {
-		return 0;
-	}
-
-	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
-	if (!ax) {
-		return -ENOMEM;
-	}
-
-	ax->fd[0] = fd1;
-	ax->fd[1] = fd2;
-
-	ax->d.type = AUDIT_FD_PAIR;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-	return 0;
+	context->fds[0] = fd1;
+	context->fds[1] = fd2;
 }
 
 /**
diff --git a/net/socket.c b/net/socket.c
index b41a92093e4..06603d73c41 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1313,13 +1313,7 @@ asmlinkage long sys_socketpair(int family, int type, int protocol,
 		goto out_fd1;
 	}
 
-	err = audit_fd_pair(fd1, fd2);
-	if (err < 0) {
-		fput(newfile1);
-		fput(newfile2);
-		goto out_fd;
-	}
-
+	audit_fd_pair(fd1, fd2);
 	fd_install(fd1, newfile1);
 	fd_install(fd2, newfile2);
 	/* fd1 and fd2 may be already another descriptors.
@@ -1349,7 +1343,6 @@ out_fd2:
 out_fd1:
 	put_filp(newfile2);
 	sock_release(sock2);
-out_fd:
 	put_unused_fd(fd1);
 	put_unused_fd(fd2);
 	goto out;
-- 
cgit v1.2.3-70-g09d2


From 57f71a0af4244d9ba3c0bce74b1d2e66e8d520bd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 Jan 2009 14:52:57 -0500
Subject: sanitize audit_log_capset()

* no allocations
* return void
* don't duplicate checked for dummy context

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  9 ++++-----
 kernel/auditsc.c      | 44 ++++++++++++++++----------------------------
 kernel/capability.c   |  4 +---
 3 files changed, 21 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index bd59cd1e321..7ddcb6a29eb 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -457,7 +457,7 @@ extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
 extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 				  const struct cred *new,
 				  const struct cred *old);
-extern int __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old);
+extern void __audit_log_capset(pid_t pid, const struct cred *new, const struct cred *old);
 
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -504,12 +504,11 @@ static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	return 0;
 }
 
-static inline int audit_log_capset(pid_t pid, const struct cred *new,
+static inline void audit_log_capset(pid_t pid, const struct cred *new,
 				   const struct cred *old)
 {
 	if (unlikely(!audit_dummy_context()))
-		return __audit_log_capset(pid, new, old);
-	return 0;
+		__audit_log_capset(pid, new, old);
 }
 
 extern int audit_n_rules;
@@ -544,7 +543,7 @@ extern int audit_signals;
 #define audit_mq_notify(d,n) ((void)0)
 #define audit_mq_getsetattr(d,s) ((void)0)
 #define audit_log_bprm_fcaps(b, ncr, ocr) ({ 0; })
-#define audit_log_capset(pid, ncr, ocr) ({ 0; })
+#define audit_log_capset(pid, ncr, ocr) ((void)0)
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 327e65d5067..c76a58215f5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -235,6 +235,10 @@ struct audit_context {
 			mode_t			mode;
 			struct mq_attr		attr;
 		} mq_open;
+		struct {
+			pid_t			pid;
+			struct audit_cap_data	cap;
+		} capset;
 	};
 	int fds[2];
 
@@ -1291,6 +1295,12 @@ static void show_special(struct audit_context *context, int *call_panic)
 			attr->mq_flags, attr->mq_maxmsg,
 			attr->mq_msgsize, attr->mq_curmsgs);
 		break; }
+	case AUDIT_CAPSET: {
+		audit_log_format(ab, "pid=%d", context->capset.pid);
+		audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
+		audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
+		audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+		break; }
 	}
 	audit_log_end(ab);
 }
@@ -1392,14 +1402,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
 			break; }
 
-		case AUDIT_CAPSET: {
-			struct audit_aux_data_capset *axs = (void *)aux;
-			audit_log_format(ab, "pid=%d", axs->pid);
-			audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
-			audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
-			audit_log_cap(ab, "cap_pe", &axs->cap.effective);
-			break; }
-
 		}
 		audit_log_end(ab);
 	}
@@ -2456,29 +2458,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
  * Record the aguments userspace sent to sys_capset for later printing by the
  * audit system if applicable
  */
-int __audit_log_capset(pid_t pid,
+void __audit_log_capset(pid_t pid,
 		       const struct cred *new, const struct cred *old)
 {
-	struct audit_aux_data_capset *ax;
 	struct audit_context *context = current->audit_context;
-
-	if (likely(!audit_enabled || !context || context->dummy))
-		return 0;
-
-	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
-	if (!ax)
-		return -ENOMEM;
-
-	ax->d.type = AUDIT_CAPSET;
-	ax->d.next = context->aux;
-	context->aux = (void *)ax;
-
-	ax->pid = pid;
-	ax->cap.effective   = new->cap_effective;
-	ax->cap.inheritable = new->cap_effective;
-	ax->cap.permitted   = new->cap_permitted;
-
-	return 0;
+	context->capset.pid = pid;
+	context->capset.cap.effective   = new->cap_effective;
+	context->capset.cap.inheritable = new->cap_effective;
+	context->capset.cap.permitted   = new->cap_permitted;
+	context->type = AUDIT_CAPSET;
 }
 
 /**
diff --git a/kernel/capability.c b/kernel/capability.c
index 36b4b4daebe..c598d9d5be4 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (ret < 0)
 		goto error;
 
-	ret = audit_log_capset(pid, new, current_cred());
-	if (ret < 0)
-		return ret;
+	audit_log_capset(pid, new, current_cred());
 
 	return commit_creds(new);
 
-- 
cgit v1.2.3-70-g09d2


From 0590b9335a1c72a3f0defcc6231287f7817e07c8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 23:45:27 -0500
Subject: fixing audit rule ordering mess, part 1

Problem: ordering between the rules on exit chain is currently lost;
all watch and inode rules are listed after everything else _and_
exit,never on one kind doesn't stop exit,always on another from
being matched.

Solution: assign priorities to rules, keep track of the current
highest-priority matching rule and its result (always/never).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  1 +
 kernel/audit.h        |  5 +---
 kernel/auditfilter.c  | 17 +++++++++--
 kernel/auditsc.c      | 79 ++++++++++++++++++++++++++++-----------------------
 4 files changed, 59 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 7ddcb6a29eb..5b47eeb00d5 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -373,6 +373,7 @@ struct audit_krule {
 	struct audit_watch	*watch;	/* associated watch */
 	struct audit_tree	*tree;	/* associated watched tree */
 	struct list_head	rlist;	/* entry in audit_{watch,tree}.rules list */
+	u64			prio;
 };
 
 struct audit_field {
diff --git a/kernel/audit.h b/kernel/audit.h
index 9d6717412fe..16f18cac661 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 		return __audit_signal_info(sig, t);
 	return 0;
 }
-extern enum audit_state audit_filter_inodes(struct task_struct *,
-					    struct audit_context *);
-extern void audit_set_auditable(struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
 #else
 #define audit_signal_info(s,t) AUDIT_DISABLED
 #define audit_filter_inodes(t,c) AUDIT_DISABLED
-#define audit_set_auditable(c)
 #endif
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0febaa0f784..995a2e86808 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -919,6 +919,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
 	new->action = old->action;
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
 		new->mask[i] = old->mask[i];
+	new->prio = old->prio;
 	new->buflen = old->buflen;
 	new->inode_f = old->inode_f;
 	new->watch = NULL;
@@ -987,9 +988,8 @@ static void audit_update_watch(struct audit_parent *parent,
 
 		/* If the update involves invalidating rules, do the inode-based
 		 * filtering now, so we don't omit records. */
-		if (invalidating && current->audit_context &&
-		    audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
-			audit_set_auditable(current->audit_context);
+		if (invalidating && current->audit_context)
+			audit_filter_inodes(current, current->audit_context);
 
 		nwatch = audit_dupe_watch(owatch);
 		if (IS_ERR(nwatch)) {
@@ -1258,6 +1258,9 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
 	return ret;
 }
 
+static u64 prio_low = ~0ULL/2;
+static u64 prio_high = ~0ULL/2 - 1;
+
 /* Add rule to given filterlist if not a duplicate. */
 static inline int audit_add_rule(struct audit_entry *entry,
 				 struct list_head *list)
@@ -1319,6 +1322,14 @@ static inline int audit_add_rule(struct audit_entry *entry,
 		}
 	}
 
+	entry->rule.prio = ~0ULL;
+	if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+		if (entry->rule.flags & AUDIT_FILTER_PREPEND)
+			entry->rule.prio = ++prio_high;
+		else
+			entry->rule.prio = --prio_low;
+	}
+
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
 		list_add_rcu(&entry->list, list);
 		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c76a58215f5..19d2c2747c8 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -165,14 +165,14 @@ struct audit_tree_refs {
 struct audit_context {
 	int		    dummy;	/* must be the first element */
 	int		    in_syscall;	/* 1 if task is in a syscall */
-	enum audit_state    state;
+	enum audit_state    state, current_state;
 	unsigned int	    serial;     /* serial number for record */
 	struct timespec	    ctime;      /* time of syscall entry */
 	int		    major;      /* syscall number */
 	unsigned long	    argv[4];    /* syscall arguments */
 	int		    return_valid; /* return code is valid */
 	long		    return_code;/* syscall return code */
-	int		    auditable;  /* 1 if record should be written */
+	u64		    prio;
 	int		    name_count;
 	struct audit_names  names[AUDIT_NAMES];
 	char *		    filterkey;	/* key for rule that triggered record */
@@ -630,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk,
 			return 0;
 		}
 	}
-	if (rule->filterkey && ctx)
-		ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
+
+	if (ctx) {
+		if (rule->prio <= ctx->prio)
+			return 0;
+		if (rule->filterkey) {
+			kfree(ctx->filterkey);
+			ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
+		}
+		ctx->prio = rule->prio;
+	}
 	switch (rule->action) {
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
 	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
@@ -685,6 +693,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 			    audit_filter_rules(tsk, &e->rule, ctx, NULL,
 					       &state)) {
 				rcu_read_unlock();
+				ctx->current_state = state;
 				return state;
 			}
 		}
@@ -698,15 +707,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
  * buckets applicable to the inode numbers in audit_names[].
  * Regarding audit_state, same rules apply as for audit_filter_syscall().
  */
-enum audit_state audit_filter_inodes(struct task_struct *tsk,
-				     struct audit_context *ctx)
+void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 {
 	int i;
 	struct audit_entry *e;
 	enum audit_state state;
 
 	if (audit_pid && tsk->tgid == audit_pid)
-		return AUDIT_DISABLED;
+		return;
 
 	rcu_read_lock();
 	for (i = 0; i < ctx->name_count; i++) {
@@ -723,17 +731,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk,
 			if ((e->rule.mask[word] & bit) == bit &&
 			    audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
 				rcu_read_unlock();
-				return state;
+				ctx->current_state = state;
+				return;
 			}
 		}
 	}
 	rcu_read_unlock();
-	return AUDIT_BUILD_CONTEXT;
 }
 
-void audit_set_auditable(struct audit_context *ctx)
+static void audit_set_auditable(struct audit_context *ctx)
 {
-	ctx->auditable = 1;
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
 }
 
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
@@ -764,23 +775,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 	else
 		context->return_code  = return_code;
 
-	if (context->in_syscall && !context->dummy && !context->auditable) {
-		enum audit_state state;
-
-		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
-		if (state == AUDIT_RECORD_CONTEXT) {
-			context->auditable = 1;
-			goto get_context;
-		}
-
-		state = audit_filter_inodes(tsk, context);
-		if (state == AUDIT_RECORD_CONTEXT)
-			context->auditable = 1;
-
+	if (context->in_syscall && !context->dummy) {
+		audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+		audit_filter_inodes(tsk, context);
 	}
 
-get_context:
-
 	tsk->audit_context = NULL;
 	return context;
 }
@@ -790,8 +789,7 @@ static inline void audit_free_names(struct audit_context *context)
 	int i;
 
 #if AUDIT_DEBUG == 2
-	if (context->auditable
-	    ||context->put_count + context->ino_count != context->name_count) {
+	if (context->put_count + context->ino_count != context->name_count) {
 		printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
 		       " name_count=%d put_count=%d"
 		       " ino_count=%d [NOT freeing]\n",
@@ -842,6 +840,7 @@ static inline void audit_zero_context(struct audit_context *context,
 {
 	memset(context, 0, sizeof(*context));
 	context->state      = state;
+	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 }
 
 static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -1543,7 +1542,7 @@ void audit_free(struct task_struct *tsk)
 	 * We use GFP_ATOMIC here because we might be doing this
 	 * in the context of the idle thread */
 	/* that can happen only if we are called from do_exit() */
-	if (context->in_syscall && context->auditable)
+	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
 		audit_log_exit(context, tsk);
 
 	audit_free_context(context);
@@ -1627,15 +1626,17 @@ void audit_syscall_entry(int arch, int major,
 
 	state = context->state;
 	context->dummy = !audit_n_rules;
-	if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
+	if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+		context->prio = 0;
 		state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
+	}
 	if (likely(state == AUDIT_DISABLED))
 		return;
 
 	context->serial     = 0;
 	context->ctime      = CURRENT_TIME;
 	context->in_syscall = 1;
-	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+	context->current_state  = state;
 	context->ppid       = 0;
 }
 
@@ -1643,17 +1644,20 @@ void audit_finish_fork(struct task_struct *child)
 {
 	struct audit_context *ctx = current->audit_context;
 	struct audit_context *p = child->audit_context;
-	if (!p || !ctx || !ctx->auditable)
+	if (!p || !ctx)
+		return;
+	if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
 		return;
 	p->arch = ctx->arch;
 	p->major = ctx->major;
 	memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
 	p->ctime = ctx->ctime;
 	p->dummy = ctx->dummy;
-	p->auditable = ctx->auditable;
 	p->in_syscall = ctx->in_syscall;
 	p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
 	p->ppid = current->pid;
+	p->prio = ctx->prio;
+	p->current_state = ctx->current_state;
 }
 
 /**
@@ -1677,11 +1681,11 @@ void audit_syscall_exit(int valid, long return_code)
 	if (likely(!context))
 		return;
 
-	if (context->in_syscall && context->auditable)
+	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
 		audit_log_exit(context, tsk);
 
 	context->in_syscall = 0;
-	context->auditable  = 0;
+	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 
 	if (context->previous) {
 		struct audit_context *new_context = context->previous;
@@ -2091,7 +2095,10 @@ int auditsc_get_stamp(struct audit_context *ctx,
 	t->tv_sec  = ctx->ctime.tv_sec;
 	t->tv_nsec = ctx->ctime.tv_nsec;
 	*serial    = ctx->serial;
-	ctx->auditable = 1;
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
 	return 1;
 }
 
-- 
cgit v1.2.3-70-g09d2


From e45aa212ea81d39b38ba158df344dc3a500153e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 15 Dec 2008 01:17:50 -0500
Subject: audit rules ordering, part 2

Fix the actual rule listing; add per-type lists _not_ used for matching,
with all exit,... sitting on one such list.  Simplifies "do something
for all rules" logics, while we are at it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  1 +
 kernel/audit_tree.c   |  1 +
 kernel/auditfilter.c  | 95 +++++++++++++++++++++------------------------------
 3 files changed, 41 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 5b47eeb00d5..cc71fdb56ae 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -373,6 +373,7 @@ struct audit_krule {
 	struct audit_watch	*watch;	/* associated watch */
 	struct audit_tree	*tree;	/* associated watched tree */
 	struct list_head	rlist;	/* entry in audit_{watch,tree}.rules list */
+	struct list_head	list;	/* for AUDIT_LIST* purposes only */
 	u64			prio;
 };
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8b509441f49..48bddad2a3d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree)
 			audit_log_end(ab);
 			rule->tree = NULL;
 			list_del_rcu(&entry->list);
+			list_del(&entry->rule.list);
 			call_rcu(&entry->rcu, audit_free_rule_rcu);
 		}
 	}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 995a2e86808..5d4edc6f7a3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #error Fix audit_filter_list initialiser
 #endif
 };
+static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
+	LIST_HEAD_INIT(audit_rules_list[0]),
+	LIST_HEAD_INIT(audit_rules_list[1]),
+	LIST_HEAD_INIT(audit_rules_list[2]),
+	LIST_HEAD_INIT(audit_rules_list[3]),
+	LIST_HEAD_INIT(audit_rules_list[4]),
+	LIST_HEAD_INIT(audit_rules_list[5]),
+};
 
 DEFINE_MUTEX(audit_filter_mutex);
 
@@ -1007,12 +1015,15 @@ static void audit_update_watch(struct audit_parent *parent,
 			list_del_rcu(&oentry->list);
 
 			nentry = audit_dupe_rule(&oentry->rule, nwatch);
-			if (IS_ERR(nentry))
+			if (IS_ERR(nentry)) {
+				list_del(&oentry->rule.list);
 				audit_panic("error updating watch, removing");
-			else {
+			} else {
 				int h = audit_hash_ino((u32)ino);
 				list_add(&nentry->rule.rlist, &nwatch->rules);
 				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+				list_replace(&oentry->rule.list,
+					     &nentry->rule.list);
 			}
 
 			call_rcu(&oentry->rcu, audit_free_rule_rcu);
@@ -1077,6 +1088,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 				audit_log_end(ab);
 			}
 			list_del(&r->rlist);
+			list_del(&r->list);
 			list_del_rcu(&e->list);
 			call_rcu(&e->rcu, audit_free_rule_rcu);
 		}
@@ -1331,9 +1343,13 @@ static inline int audit_add_rule(struct audit_entry *entry,
 	}
 
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+		list_add(&entry->rule.list,
+			 &audit_rules_list[entry->rule.listnr]);
 		list_add_rcu(&entry->list, list);
 		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 	} else {
+		list_add_tail(&entry->rule.list,
+			      &audit_rules_list[entry->rule.listnr]);
 		list_add_tail_rcu(&entry->list, list);
 	}
 #ifdef CONFIG_AUDITSYSCALL
@@ -1415,6 +1431,7 @@ static inline int audit_del_rule(struct audit_entry *entry,
 		audit_remove_tree_rule(&e->rule);
 
 	list_del_rcu(&e->list);
+	list_del(&e->rule.list);
 	call_rcu(&e->rcu, audit_free_rule_rcu);
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -1443,30 +1460,16 @@ out:
 static void audit_list(int pid, int seq, struct sk_buff_head *q)
 {
 	struct sk_buff *skb;
-	struct audit_entry *entry;
+	struct audit_krule *r;
 	int i;
 
 	/* This is a blocking read, so use audit_filter_mutex instead of rcu
 	 * iterator to sync with list writers. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(entry, &audit_filter_list[i], list) {
-			struct audit_rule *rule;
-
-			rule = audit_krule_to_rule(&entry->rule);
-			if (unlikely(!rule))
-				break;
-			skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 rule, sizeof(*rule));
-			if (skb)
-				skb_queue_tail(q, skb);
-			kfree(rule);
-		}
-	}
-	for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
-		list_for_each_entry(entry, &audit_inode_hash[i], list) {
+		list_for_each_entry(r, &audit_rules_list[i], list) {
 			struct audit_rule *rule;
 
-			rule = audit_krule_to_rule(&entry->rule);
+			rule = audit_krule_to_rule(r);
 			if (unlikely(!rule))
 				break;
 			skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
@@ -1485,30 +1488,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
 static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
 {
 	struct sk_buff *skb;
-	struct audit_entry *e;
+	struct audit_krule *r;
 	int i;
 
 	/* This is a blocking read, so use audit_filter_mutex instead of rcu
 	 * iterator to sync with list writers. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(e, &audit_filter_list[i], list) {
-			struct audit_rule_data *data;
-
-			data = audit_krule_to_data(&e->rule);
-			if (unlikely(!data))
-				break;
-			skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
-					 data, sizeof(*data) + data->buflen);
-			if (skb)
-				skb_queue_tail(q, skb);
-			kfree(data);
-		}
-	}
-	for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
-		list_for_each_entry(e, &audit_inode_hash[i], list) {
+		list_for_each_entry(r, &audit_rules_list[i], list) {
 			struct audit_rule_data *data;
 
-			data = audit_krule_to_data(&e->rule);
+			data = audit_krule_to_data(r);
 			if (unlikely(!data))
 				break;
 			skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
@@ -1789,35 +1778,37 @@ unlock_and_return:
 	return result;
 }
 
-static int update_lsm_rule(struct audit_entry *entry)
+static int update_lsm_rule(struct audit_krule *r)
 {
+	struct audit_entry *entry = container_of(r, struct audit_entry, rule);
 	struct audit_entry *nentry;
 	struct audit_watch *watch;
 	struct audit_tree *tree;
 	int err = 0;
 
-	if (!security_audit_rule_known(&entry->rule))
+	if (!security_audit_rule_known(r))
 		return 0;
 
-	watch = entry->rule.watch;
-	tree = entry->rule.tree;
-	nentry = audit_dupe_rule(&entry->rule, watch);
+	watch = r->watch;
+	tree = r->tree;
+	nentry = audit_dupe_rule(r, watch);
 	if (IS_ERR(nentry)) {
 		/* save the first error encountered for the
 		 * return value */
 		err = PTR_ERR(nentry);
 		audit_panic("error updating LSM filters");
 		if (watch)
-			list_del(&entry->rule.rlist);
+			list_del(&r->rlist);
 		list_del_rcu(&entry->list);
+		list_del(&r->list);
 	} else {
 		if (watch) {
 			list_add(&nentry->rule.rlist, &watch->rules);
-			list_del(&entry->rule.rlist);
+			list_del(&r->rlist);
 		} else if (tree)
-			list_replace_init(&entry->rule.rlist,
-				     &nentry->rule.rlist);
+			list_replace_init(&r->rlist, &nentry->rule.rlist);
 		list_replace_rcu(&entry->list, &nentry->list);
+		list_replace(&r->list, &nentry->rule.list);
 	}
 	call_rcu(&entry->rcu, audit_free_rule_rcu);
 
@@ -1831,27 +1822,19 @@ static int update_lsm_rule(struct audit_entry *entry)
  * updated rule. */
 int audit_update_lsm_rules(void)
 {
-	struct audit_entry *e, *n;
+	struct audit_krule *r, *n;
 	int i, err = 0;
 
 	/* audit_filter_mutex synchronizes the writers */
 	mutex_lock(&audit_filter_mutex);
 
 	for (i = 0; i < AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry_safe(e, n, &audit_filter_list[i], list) {
-			int res = update_lsm_rule(e);
-			if (!err)
-				err = res;
-		}
-	}
-	for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
-		list_for_each_entry_safe(e, n, &audit_inode_hash[i], list) {
-			int res = update_lsm_rule(e);
+		list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
+			int res = update_lsm_rule(r);
 			if (!err)
 				err = res;
 		}
 	}
-
 	mutex_unlock(&audit_filter_mutex);
 
 	return err;
-- 
cgit v1.2.3-70-g09d2


From 5af75d8d58d0f9f7b7c0515b35786b22892d5f12 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 16 Dec 2008 05:59:26 -0500
Subject: audit: validate comparison operations, store them in sane form

Don't store the field->op in the messy (and very inconvenient for e.g.
audit_comparator()) form; translate to dense set of values and do full
validation of userland-submitted value while we are at it.

->audit_init_rule() and ->audit_match_rule() get new values now; in-tree
instances updated.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h          |  12 ++++
 kernel/audit_tree.c            |   2 +-
 kernel/auditfilter.c           | 132 ++++++++++++++++++++---------------------
 security/selinux/ss/services.c |  26 ++++----
 security/smack/smack_lsm.c     |   6 +-
 5 files changed, 94 insertions(+), 84 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index cc71fdb56ae..67e5dbfc296 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -247,6 +247,18 @@
 #define AUDIT_GREATER_THAN_OR_EQUAL	(AUDIT_GREATER_THAN|AUDIT_EQUAL)
 #define AUDIT_OPERATORS			(AUDIT_EQUAL|AUDIT_NOT_EQUAL|AUDIT_BIT_MASK)
 
+enum {
+	Audit_equal,
+	Audit_not_equal,
+	Audit_bitmask,
+	Audit_bittest,
+	Audit_lt,
+	Audit_gt,
+	Audit_le,
+	Audit_ge,
+	Audit_bad
+};
+
 /* Status symbols */
 				/* Mask values */
 #define AUDIT_STATUS_ENABLED		0x0001
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 48bddad2a3d..8ad9545b8db 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -618,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 
 	if (pathname[0] != '/' ||
 	    rule->listnr != AUDIT_FILTER_EXIT ||
-	    op & ~AUDIT_EQUAL ||
+	    op != Audit_equal ||
 	    rule->inode_f || rule->watch || rule->tree)
 		return -EINVAL;
 	rule->tree = alloc_tree(pathname);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index e6e3829cadd..fbf24d121d9 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -252,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule,
 				 struct audit_field *f)
 {
 	if (krule->listnr != AUDIT_FILTER_EXIT ||
-	    krule->watch || krule->inode_f || krule->tree)
+	    krule->watch || krule->inode_f || krule->tree ||
+	    (f->op != Audit_equal && f->op != Audit_not_equal))
 		return -EINVAL;
 
 	krule->inode_f = f;
@@ -270,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
 
 	if (path[0] != '/' || path[len-1] == '/' ||
 	    krule->listnr != AUDIT_FILTER_EXIT ||
-	    op & ~AUDIT_EQUAL ||
+	    op != Audit_equal ||
 	    krule->inode_f || krule->watch || krule->tree)
 		return -EINVAL;
 
@@ -420,12 +421,32 @@ exit_err:
 	return ERR_PTR(err);
 }
 
+static u32 audit_ops[] =
+{
+	[Audit_equal] = AUDIT_EQUAL,
+	[Audit_not_equal] = AUDIT_NOT_EQUAL,
+	[Audit_bitmask] = AUDIT_BIT_MASK,
+	[Audit_bittest] = AUDIT_BIT_TEST,
+	[Audit_lt] = AUDIT_LESS_THAN,
+	[Audit_gt] = AUDIT_GREATER_THAN,
+	[Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
+	[Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
+};
+
+static u32 audit_to_op(u32 op)
+{
+	u32 n;
+	for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
+		;
+	return n;
+}
+
+
 /* Translate struct audit_rule to kernel's rule respresentation.
  * Exists for backward compatibility with userspace. */
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
-	struct audit_field *ino_f;
 	int err = 0;
 	int i;
 
@@ -435,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 
 	for (i = 0; i < rule->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
+		u32 n;
+
+		n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
+
+		/* Support for legacy operators where
+		 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
+		if (n & AUDIT_NEGATE)
+			f->op = Audit_not_equal;
+		else if (!n)
+			f->op = Audit_equal;
+		else
+			f->op = audit_to_op(n);
+
+		entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
 
-		f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->val = rule->values[i];
 
 		err = -EINVAL;
+		if (f->op == Audit_bad)
+			goto exit_free;
+
 		switch(f->type) {
 		default:
 			goto exit_free;
@@ -462,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		case AUDIT_EXIT:
 		case AUDIT_SUCCESS:
 			/* bit ops are only useful on syscall args */
-			if (f->op == AUDIT_BIT_MASK ||
-						f->op == AUDIT_BIT_TEST) {
-				err = -EINVAL;
+			if (f->op == Audit_bitmask || f->op == Audit_bittest)
 				goto exit_free;
-			}
 			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
@@ -475,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 			break;
 		/* arch is only allowed to be = or != */
 		case AUDIT_ARCH:
-			if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL)
-					&& (f->op != AUDIT_NEGATE) && (f->op)) {
-				err = -EINVAL;
+			if (f->op != Audit_not_equal && f->op != Audit_equal)
 				goto exit_free;
-			}
 			entry->rule.arch_f = f;
 			break;
 		case AUDIT_PERM:
@@ -496,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 				goto exit_free;
 			break;
 		}
-
-		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
-
-		/* Support for legacy operators where
-		 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
-		if (f->op & AUDIT_NEGATE)
-			f->op = AUDIT_NOT_EQUAL;
-		else if (!f->op)
-			f->op = AUDIT_EQUAL;
-		else if (f->op == AUDIT_OPERATORS) {
-			err = -EINVAL;
-			goto exit_free;
-		}
 	}
 
-	ino_f = entry->rule.inode_f;
-	if (ino_f) {
-		switch(ino_f->op) {
-		case AUDIT_NOT_EQUAL:
-			entry->rule.inode_f = NULL;
-		case AUDIT_EQUAL:
-			break;
-		default:
-			err = -EINVAL;
-			goto exit_free;
-		}
-	}
+	if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
+		entry->rule.inode_f = NULL;
 
 exit_nofree:
 	return entry;
@@ -538,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
-	struct audit_field *ino_f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -554,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		struct audit_field *f = &entry->rule.fields[i];
 
 		err = -EINVAL;
-		if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
-		    data->fieldflags[i] & ~AUDIT_OPERATORS)
+
+		f->op = audit_to_op(data->fieldflags[i]);
+		if (f->op == Audit_bad)
 			goto exit_free;
 
-		f->op = data->fieldflags[i] & AUDIT_OPERATORS;
 		f->type = data->fields[i];
 		f->val = data->values[i];
 		f->lsm_str = NULL;
@@ -670,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		}
 	}
 
-	ino_f = entry->rule.inode_f;
-	if (ino_f) {
-		switch(ino_f->op) {
-		case AUDIT_NOT_EQUAL:
-			entry->rule.inode_f = NULL;
-		case AUDIT_EQUAL:
-			break;
-		default:
-			err = -EINVAL;
-			goto exit_free;
-		}
-	}
+	if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
+		entry->rule.inode_f = NULL;
 
 exit_nofree:
 	return entry;
@@ -721,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 		rule->fields[i] = krule->fields[i].type;
 
 		if (krule->vers_ops == 1) {
-			if (krule->fields[i].op & AUDIT_NOT_EQUAL)
+			if (krule->fields[i].op == Audit_not_equal)
 				rule->fields[i] |= AUDIT_NEGATE;
 		} else {
-			rule->fields[i] |= krule->fields[i].op;
+			rule->fields[i] |= audit_ops[krule->fields[i].op];
 		}
 	}
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
@@ -752,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 		struct audit_field *f = &krule->fields[i];
 
 		data->fields[i] = f->type;
-		data->fieldflags[i] = f->op;
+		data->fieldflags[i] = audit_ops[f->op];
 		switch(f->type) {
 		case AUDIT_SUBJ_USER:
 		case AUDIT_SUBJ_ROLE:
@@ -1626,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 	return err;
 }
 
-int audit_comparator(const u32 left, const u32 op, const u32 right)
+int audit_comparator(u32 left, u32 op, u32 right)
 {
 	switch (op) {
-	case AUDIT_EQUAL:
+	case Audit_equal:
 		return (left == right);
-	case AUDIT_NOT_EQUAL:
+	case Audit_not_equal:
 		return (left != right);
-	case AUDIT_LESS_THAN:
+	case Audit_lt:
 		return (left < right);
-	case AUDIT_LESS_THAN_OR_EQUAL:
+	case Audit_le:
 		return (left <= right);
-	case AUDIT_GREATER_THAN:
+	case Audit_gt:
 		return (left > right);
-	case AUDIT_GREATER_THAN_OR_EQUAL:
+	case Audit_ge:
 		return (left >= right);
-	case AUDIT_BIT_MASK:
+	case Audit_bitmask:
 		return (left & right);
-	case AUDIT_BIT_TEST:
+	case Audit_bittest:
 		return ((left & right) == right);
+	default:
+		BUG();
+		return 0;
 	}
-	BUG();
-	return 0;
 }
 
 /* Compare given dentry name with last component in given path,
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 343c8ab14af..c65e4fe4a0f 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2602,7 +2602,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule)
 	case AUDIT_OBJ_ROLE:
 	case AUDIT_OBJ_TYPE:
 		/* only 'equals' and 'not equals' fit user, role, and type */
-		if (op != AUDIT_EQUAL && op != AUDIT_NOT_EQUAL)
+		if (op != Audit_equal && op != Audit_not_equal)
 			return -EINVAL;
 		break;
 	case AUDIT_SUBJ_SEN:
@@ -2736,10 +2736,10 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
 	case AUDIT_SUBJ_USER:
 	case AUDIT_OBJ_USER:
 		switch (op) {
-		case AUDIT_EQUAL:
+		case Audit_equal:
 			match = (ctxt->user == rule->au_ctxt.user);
 			break;
-		case AUDIT_NOT_EQUAL:
+		case Audit_not_equal:
 			match = (ctxt->user != rule->au_ctxt.user);
 			break;
 		}
@@ -2747,10 +2747,10 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
 	case AUDIT_SUBJ_ROLE:
 	case AUDIT_OBJ_ROLE:
 		switch (op) {
-		case AUDIT_EQUAL:
+		case Audit_equal:
 			match = (ctxt->role == rule->au_ctxt.role);
 			break;
-		case AUDIT_NOT_EQUAL:
+		case Audit_not_equal:
 			match = (ctxt->role != rule->au_ctxt.role);
 			break;
 		}
@@ -2758,10 +2758,10 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
 	case AUDIT_SUBJ_TYPE:
 	case AUDIT_OBJ_TYPE:
 		switch (op) {
-		case AUDIT_EQUAL:
+		case Audit_equal:
 			match = (ctxt->type == rule->au_ctxt.type);
 			break;
-		case AUDIT_NOT_EQUAL:
+		case Audit_not_equal:
 			match = (ctxt->type != rule->au_ctxt.type);
 			break;
 		}
@@ -2774,31 +2774,31 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule,
 			  field == AUDIT_OBJ_LEV_LOW) ?
 			 &ctxt->range.level[0] : &ctxt->range.level[1]);
 		switch (op) {
-		case AUDIT_EQUAL:
+		case Audit_equal:
 			match = mls_level_eq(&rule->au_ctxt.range.level[0],
 					     level);
 			break;
-		case AUDIT_NOT_EQUAL:
+		case Audit_not_equal:
 			match = !mls_level_eq(&rule->au_ctxt.range.level[0],
 					      level);
 			break;
-		case AUDIT_LESS_THAN:
+		case Audit_lt:
 			match = (mls_level_dom(&rule->au_ctxt.range.level[0],
 					       level) &&
 				 !mls_level_eq(&rule->au_ctxt.range.level[0],
 					       level));
 			break;
-		case AUDIT_LESS_THAN_OR_EQUAL:
+		case Audit_le:
 			match = mls_level_dom(&rule->au_ctxt.range.level[0],
 					      level);
 			break;
-		case AUDIT_GREATER_THAN:
+		case Audit_gt:
 			match = (mls_level_dom(level,
 					      &rule->au_ctxt.range.level[0]) &&
 				 !mls_level_eq(level,
 					       &rule->au_ctxt.range.level[0]));
 			break;
-		case AUDIT_GREATER_THAN_OR_EQUAL:
+		case Audit_ge:
 			match = mls_level_dom(level,
 					      &rule->au_ctxt.range.level[0]);
 			break;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 1b5551dfc1f..848212fd484 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2492,7 +2492,7 @@ static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule)
 	if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER)
 		return -EINVAL;
 
-	if (op != AUDIT_EQUAL && op != AUDIT_NOT_EQUAL)
+	if (op != Audit_equal && op != Audit_not_equal)
 		return -EINVAL;
 
 	*rule = smk_import(rulestr, 0);
@@ -2556,9 +2556,9 @@ static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule,
 	 * both pointers will point to the same smack_known
 	 * label.
 	 */
-	if (op == AUDIT_EQUAL)
+	if (op == Audit_equal)
 		return (rule == smack);
-	if (op == AUDIT_NOT_EQUAL)
+	if (op == Audit_not_equal)
 		return (rule != smack);
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 0a30c5cefa53cbac429dcb2de906c0637b646253 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 4 Jan 2009 12:00:47 -0800
Subject: spi.h uses/needs device.h

Include header files as used/needed:

  In file included from drivers/leds/leds-dac124s085.c:16:
  include/linux/spi/spi.h:66: error: field 'dev' has incomplete type
  include/linux/spi/spi.h: In function 'to_spi_device':
  include/linux/spi/spi.h:100: warning: type defaults to 'int' in declaration of '__mptr'
  ...

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/spi/spi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 4be01bb4437..82229317753 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -19,6 +19,8 @@
 #ifndef __LINUX_SPI_H
 #define __LINUX_SPI_H
 
+#include <linux/device.h>
+
 /*
  * INTERFACES between SPI master-side drivers and SPI infrastructure.
  * (There's no SPI slave support for Linux yet...)
-- 
cgit v1.2.3-70-g09d2


From c644f0e4b56f9a2fc066cd0d75a18074d130e4a3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 4 Jan 2009 12:00:48 -0800
Subject: fs: introduce bgl_lock_ptr()

As suggested by Andreas Dilger, introduce a bgl_lock_ptr() helper in
<linux/blockgroup_lock.h> and add separate sb_bgl_lock() helpers to
filesystem specific header files to break the hidden dependency to
struct ext[234]_sb_info.

Also, while at it, convert the macros to static inlines to try make up
for all the times I broke Andrew Morton's tree.

Acked-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/ext4_sb.h               | 6 ++++++
 include/linux/blockgroup_lock.h | 7 +++++--
 include/linux/ext2_fs_sb.h      | 6 ++++++
 include/linux/ext3_fs_sb.h      | 6 ++++++
 4 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df..b21f16713db 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -146,4 +146,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 };
 
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
+
 #endif	/* _EXT4_SB */
diff --git a/include/linux/blockgroup_lock.h b/include/linux/blockgroup_lock.h
index 8607312983b..e44b88ba552 100644
--- a/include/linux/blockgroup_lock.h
+++ b/include/linux/blockgroup_lock.h
@@ -53,7 +53,10 @@ static inline void bgl_lock_init(struct blockgroup_lock *bgl)
  * The accessor is a macro so we can embed a blockgroup_lock into different
  * superblock types
  */
-#define sb_bgl_lock(sb, block_group) \
-	(&(sb)->s_blockgroup_lock.locks[(block_group) & (NR_BG_LOCKS-1)].lock)
+static inline spinlock_t *
+bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group)
+{
+	return &bgl->locks[(block_group) & (NR_BG_LOCKS-1)].lock;
+}
 
 #endif
diff --git a/include/linux/ext2_fs_sb.h b/include/linux/ext2_fs_sb.h
index f273415ab6f..dc541f3653d 100644
--- a/include/linux/ext2_fs_sb.h
+++ b/include/linux/ext2_fs_sb.h
@@ -108,4 +108,10 @@ struct ext2_sb_info {
 	struct ext2_reserve_window_node s_rsv_window_head;
 };
 
+static inline spinlock_t *
+sb_bgl_lock(struct ext2_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
+
 #endif	/* _LINUX_EXT2_FS_SB */
diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h
index b65f0288b84..e024e38248f 100644
--- a/include/linux/ext3_fs_sb.h
+++ b/include/linux/ext3_fs_sb.h
@@ -83,4 +83,10 @@ struct ext3_sb_info {
 #endif
 };
 
+static inline spinlock_t *
+sb_bgl_lock(struct ext3_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
+
 #endif	/* _LINUX_EXT3_FS_SB */
-- 
cgit v1.2.3-70-g09d2


From 54566b2c1594c2326a645a3551f9d989f7ba3c5e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sun, 4 Jan 2009 12:00:53 -0800
Subject: fs: symlink write_begin allocation context fix

With the write_begin/write_end aops, page_symlink was broken because it
could no longer pass a GFP_NOFS type mask into the point where the
allocations happened.  They are done in write_begin, which would always
assume that the filesystem can be entered from reclaim.  This bug could
cause filesystem deadlocks.

The funny thing with having a gfp_t mask there is that it doesn't really
allow the caller to arbitrarily tinker with the context in which it can be
called.  It couldn't ever be GFP_ATOMIC, for example, because it needs to
take the page lock.  The only thing any callers care about is __GFP_FS
anyway, so turn that into a single flag.

Add a new flag for write_begin, AOP_FLAG_NOFS.  Filesystems can now act on
this flag in their write_begin function.  Change __grab_cache_page to
accept a nofs argument as well, to honour that flag (while we're there,
change the name to grab_cache_page_write_begin which is more instructive
and does away with random leading underscores).

This is really a more flexible way to go in the end anyway -- if a
filesystem happens to want any extra allocations aside from the pagecache
ones in ints write_begin function, it may now use GFP_KERNEL (rather than
GFP_NOFS) for common case allocations (eg.  ocfs2_alloc_write_ctxt, for a
random example).

[kosaki.motohiro@jp.fujitsu.com: fix ubifs]
[kosaki.motohiro@jp.fujitsu.com: fix fuse]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: <stable@kernel.org>		[2.6.28.x]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Cleaned up the calling convention: just pass in the AOP flags
  untouched to the grab_cache_page_write_begin() function.  That
  just simplifies everybody, and may even allow future expansion of the
  logic.   - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/affs/file.c          |  2 +-
 fs/afs/write.c          |  2 +-
 fs/buffer.c             |  4 ++--
 fs/cifs/file.c          |  2 +-
 fs/ecryptfs/mmap.c      |  2 +-
 fs/ext3/inode.c         |  2 +-
 fs/ext3/namei.c         |  3 +--
 fs/ext4/inode.c         |  4 ++--
 fs/ext4/namei.c         |  3 +--
 fs/fuse/file.c          |  4 ++--
 fs/gfs2/ops_address.c   |  2 +-
 fs/hostfs/hostfs_kern.c |  2 +-
 fs/jffs2/file.c         |  2 +-
 fs/libfs.c              |  2 +-
 fs/namei.c              | 13 +++++++++----
 fs/nfs/file.c           |  2 +-
 fs/reiserfs/inode.c     |  2 +-
 fs/smbfs/file.c         |  2 +-
 fs/ubifs/file.c         |  9 +++++----
 include/linux/fs.h      |  5 ++++-
 include/linux/pagemap.h |  3 ++-
 mm/filemap.c            | 13 +++++++++----
 22 files changed, 49 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6..9246cb4aa01 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35f..3fb36d43362 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		kfree(candidate);
 		return -ENOMEM;
diff --git a/fs/buffer.c b/fs/buffer.c
index 776ae091d3b..a13f09b696f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1996,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 	page = *pagep;
 	if (page == NULL) {
 		ownpage = 1;
-		page = __grab_cache_page(mapping, index);
+		page = grab_cache_page_write_begin(mapping, index, flags);
 		if (!page) {
 			status = -ENOMEM;
 			goto out;
@@ -2502,7 +2502,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6..12bb656fbe7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 
 	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		rc = -ENOMEM;
 		goto out;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac..46cec2b6979 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
 	loff_t prev_page_end_size;
 	int rc = 0;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c4bdccf976b..5fa453b49a6 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1161,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 	to = from + len;
 
 retry:
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 297ea8dfac7..1dd2abe6313 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2175,8 +2175,7 @@ retry:
 		 * We have a transaction open.  All is sweetness.  It also sets
 		 * i_size in generic_commit_write().
 		 */
-		err = __page_symlink(inode, symname, l,
-				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+		err = __page_symlink(inode, symname, l, 1);
 		if (err) {
 			drop_nlink(inode);
 			unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c3325e0b00..6702a49992a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1346,7 +1346,7 @@ retry:
 		goto out;
 	}
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
 		ret = -ENOMEM;
@@ -2550,7 +2550,7 @@ retry:
 		goto out;
 	}
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
 		ret = -ENOMEM;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index da98a9012fa..9fd2a5e1be4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2212,8 +2212,7 @@ retry:
 		 * We have a transaction open.  All is sweetness.  It also sets
 		 * i_size in generic_commit_write().
 		 */
-		err = __page_symlink(inode, symname, l,
-				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+		err = __page_symlink(inode, symname, l, 1);
 		if (err) {
 			clear_nlink(inode);
 			unlock_new_inode(inode);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b8..4c9ee701126 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -646,7 +646,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
@@ -779,7 +779,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 			break;
 
 		err = -ENOMEM;
-		page = __grab_cache_page(mapping, index);
+		page = grab_cache_page_write_begin(mapping, index, 0);
 		if (!page)
 			break;
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c..15f710f2d4d 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -675,7 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		goto out_trans_fail;
 
 	error = -ENOMEM;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	*pagep = page;
 	if (unlikely(!page))
 		goto out_endtrans;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac17..5c538e0ec14 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c85..5edc2bf2058 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	uint32_t pageofs = index << PAGE_CACHE_SHIFT;
 	int ret = 0;
 
-	pg = __grab_cache_page(mapping, index);
+	pg = grab_cache_page_write_begin(mapping, index, flags);
 	if (!pg)
 		return -ENOMEM;
 	*pagep = pg;
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a832190..bdaec17fa38 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -360,7 +360,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 
diff --git a/fs/namei.c b/fs/namei.c
index dd5c9f0bf82..df2d3df4f04 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2817,18 +2817,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 	}
 }
 
-int __page_symlink(struct inode *inode, const char *symname, int len,
-		gfp_t gfp_mask)
+/*
+ * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+ */
+int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	void *fsdata;
 	int err;
 	char *kaddr;
+	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+	if (nofs)
+		flags |= AOP_FLAG_NOFS;
 
 retry:
 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+				flags, &page, &fsdata);
 	if (err)
 		goto fail;
 
@@ -2852,7 +2857,7 @@ fail:
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
 	return __page_symlink(inode, symname, len,
-			mapping_gfp_mask(inode->i_mapping));
+			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 }
 
 const struct inode_operations page_symlink_inode_operations = {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f0..90f292b520d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 145c2d3e5e0..ed04f47007f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2561,7 +2561,7 @@ static int reiserfs_write_begin(struct file *file,
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a555..92d5e8ffb63 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index fe82d2464d4..bf37374567f 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
 }
 
 static int write_begin_slow(struct address_space *mapping,
-			    loff_t pos, unsigned len, struct page **pagep)
+			    loff_t pos, unsigned len, struct page **pagep,
+			    unsigned flags)
 {
 	struct inode *inode = mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,7 +248,7 @@ static int write_begin_slow(struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (unlikely(!page)) {
 		ubifs_release_budget(c, &req);
 		return -ENOMEM;
@@ -438,7 +439,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		return -EROFS;
 
 	/* Try out the fast-path part first */
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (unlikely(!page))
 		return -ENOMEM;
 
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		unlock_page(page);
 		page_cache_release(page);
 
-		return write_begin_slow(mapping, pos, len, pagep);
+		return write_begin_slow(mapping, pos, len, pagep, flags);
 	}
 
 	/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e2170ee21e1..f2a3010140e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -423,6 +423,9 @@ enum positive_aop_returns {
 
 #define AOP_FLAG_UNINTERRUPTIBLE	0x0001 /* will not do a short write */
 #define AOP_FLAG_CONT_EXPAND		0x0002 /* called from cont_expand */
+#define AOP_FLAG_NOFS			0x0004 /* used by filesystem to direct
+						* helper code (eg buffer layer)
+						* to clear GFP_FS from alloc */
 
 /*
  * oh the beauties of C type declarations.
@@ -2035,7 +2038,7 @@ extern int page_readlink(struct dentry *, char __user *, int);
 extern void *page_follow_link_light(struct dentry *, struct nameidata *);
 extern void page_put_link(struct dentry *, struct nameidata *, void *);
 extern int __page_symlink(struct inode *inode, const char *symname, int len,
-		gfp_t gfp_mask);
+		int nofs);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern int generic_readlink(struct dentry *, char __user *, int);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 709742be02f..01ca0856caf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -241,7 +241,8 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
-struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+			pgoff_t index, unsigned flags);
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f8944d1..f8c69273c37 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2140,19 +2140,24 @@ EXPORT_SYMBOL(generic_file_direct_write);
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+					pgoff_t index, unsigned flags)
 {
 	int status;
 	struct page *page;
+	gfp_t gfp_notmask = 0;
+	if (flags & AOP_FLAG_NOFS)
+		gfp_notmask = __GFP_FS;
 repeat:
 	page = find_lock_page(mapping, index);
 	if (likely(page))
 		return page;
 
-	page = page_cache_alloc(mapping);
+	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
 	if (!page)
 		return NULL;
-	status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
+	status = add_to_page_cache_lru(page, mapping, index,
+						GFP_KERNEL & ~gfp_notmask);
 	if (unlikely(status)) {
 		page_cache_release(page);
 		if (status == -EEXIST)
@@ -2161,7 +2166,7 @@ repeat:
 	}
 	return page;
 }
-EXPORT_SYMBOL(__grab_cache_page);
+EXPORT_SYMBOL(grab_cache_page_write_begin);
 
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
-- 
cgit v1.2.3-70-g09d2


From 099e657625e801adf82054c8050dde5aceb68452 Mon Sep 17 00:00:00 2001
From: Alessandro Zummo <a.zummo@towertech.it>
Date: Sun, 4 Jan 2009 12:00:54 -0800
Subject: rtc: add alarm/update irq interfaces

Add standard interfaces for alarm/update irqs enabling.  Drivers are no
more required to implement equivalent ioctl code as rtc-dev will provide
it.

UIE emulation should now be handled correctly and will work even for those
RTC drivers who cannot be configured to do both UIE and AIE.

Signed-off-by: Alessandro Zummo <a.zummo@towertech.it>
Cc: David Brownell <david-b@pacbell.net>
Cc: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/Kconfig     |  6 +++++-
 drivers/rtc/interface.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/rtc/rtc-dev.c   | 51 +++++++++++++++++++++++++++++++---------------
 include/linux/rtc.h     |  8 +++++++-
 4 files changed, 101 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 123092d8a98..165a8184335 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -102,9 +102,13 @@ config RTC_INTF_DEV_UIE_EMUL
 	depends on RTC_INTF_DEV
 	help
 	  Provides an emulation for RTC_UIE if the underlying rtc chip
-	  driver does not expose RTC_UIE ioctls.  Those requests generate
+	  driver does not expose RTC_UIE ioctls. Those requests generate
 	  once-per-second update interrupts, used for synchronization.
 
+	  The emulation code will read the time from the hardware
+	  clock several times per second, please enable this option
+	  only if you know that you really need it.
+
 config RTC_DRV_TEST
 	tristate "Test driver/device"
 	help
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index a04c1b6b157..fd2c652504f 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -307,6 +307,60 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 }
 EXPORT_SYMBOL_GPL(rtc_set_alarm);
 
+int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled)
+{
+	int err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return err;
+
+	if (!rtc->ops)
+		err = -ENODEV;
+	else if (!rtc->ops->alarm_irq_enable)
+		err = -EINVAL;
+	else
+		err = rtc->ops->alarm_irq_enable(rtc->dev.parent, enabled);
+
+	mutex_unlock(&rtc->ops_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_alarm_irq_enable);
+
+int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled)
+{
+	int err = mutex_lock_interruptible(&rtc->ops_lock);
+	if (err)
+		return err;
+
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	if (enabled == 0 && rtc->uie_irq_active) {
+		mutex_unlock(&rtc->ops_lock);
+		return rtc_dev_update_irq_enable_emul(rtc, enabled);
+	}
+#endif
+
+	if (!rtc->ops)
+		err = -ENODEV;
+	else if (!rtc->ops->update_irq_enable)
+		err = -EINVAL;
+	else
+		err = rtc->ops->update_irq_enable(rtc->dev.parent, enabled);
+
+	mutex_unlock(&rtc->ops_lock);
+
+#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
+	/*
+	 * Enable emulation if the driver did not provide
+	 * the update_irq_enable function pointer or if returned
+	 * -EINVAL to signal that it has been configured without
+	 * interrupts or that are not available at the moment.
+	 */
+	if (err == -EINVAL)
+		err = rtc_dev_update_irq_enable_emul(rtc, enabled);
+#endif
+	return err;
+}
+EXPORT_SYMBOL_GPL(rtc_update_irq_enable);
+
 /**
  * rtc_update_irq - report RTC periodic, alarm, and/or update irqs
  * @rtc: the rtc device
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index ecdea44ae4e..45152f4952d 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -92,10 +92,10 @@ static void rtc_uie_timer(unsigned long data)
 	spin_unlock_irqrestore(&rtc->irq_lock, flags);
 }
 
-static void clear_uie(struct rtc_device *rtc)
+static int clear_uie(struct rtc_device *rtc)
 {
 	spin_lock_irq(&rtc->irq_lock);
-	if (rtc->irq_active) {
+	if (rtc->uie_irq_active) {
 		rtc->stop_uie_polling = 1;
 		if (rtc->uie_timer_active) {
 			spin_unlock_irq(&rtc->irq_lock);
@@ -108,9 +108,10 @@ static void clear_uie(struct rtc_device *rtc)
 			flush_scheduled_work();
 			spin_lock_irq(&rtc->irq_lock);
 		}
-		rtc->irq_active = 0;
+		rtc->uie_irq_active = 0;
 	}
 	spin_unlock_irq(&rtc->irq_lock);
+	return 0;
 }
 
 static int set_uie(struct rtc_device *rtc)
@@ -122,8 +123,8 @@ static int set_uie(struct rtc_device *rtc)
 	if (err)
 		return err;
 	spin_lock_irq(&rtc->irq_lock);
-	if (!rtc->irq_active) {
-		rtc->irq_active = 1;
+	if (!rtc->uie_irq_active) {
+		rtc->uie_irq_active = 1;
 		rtc->stop_uie_polling = 0;
 		rtc->oldsecs = tm.tm_sec;
 		rtc->uie_task_active = 1;
@@ -134,6 +135,16 @@ static int set_uie(struct rtc_device *rtc)
 	spin_unlock_irq(&rtc->irq_lock);
 	return 0;
 }
+
+int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, unsigned int enabled)
+{
+	if (enabled)
+		return set_uie(rtc);
+	else
+		return clear_uie(rtc);
+}
+EXPORT_SYMBOL(rtc_dev_update_irq_enable_emul);
+
 #endif /* CONFIG_RTC_INTF_DEV_UIE_EMUL */
 
 static ssize_t
@@ -357,6 +368,22 @@ static long rtc_dev_ioctl(struct file *file,
 		err = rtc_irq_set_state(rtc, NULL, 0);
 		break;
 
+	case RTC_AIE_ON:
+		mutex_unlock(&rtc->ops_lock);
+		return rtc_alarm_irq_enable(rtc, 1);
+
+	case RTC_AIE_OFF:
+		mutex_unlock(&rtc->ops_lock);
+		return rtc_alarm_irq_enable(rtc, 0);
+
+	case RTC_UIE_ON:
+		mutex_unlock(&rtc->ops_lock);
+		return rtc_update_irq_enable(rtc, 1);
+
+	case RTC_UIE_OFF:
+		mutex_unlock(&rtc->ops_lock);
+		return rtc_update_irq_enable(rtc, 0);
+
 	case RTC_IRQP_SET:
 		err = rtc_irq_set_freq(rtc, NULL, arg);
 		break;
@@ -401,17 +428,6 @@ static long rtc_dev_ioctl(struct file *file,
 			err = -EFAULT;
 		return err;
 
-#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
-	case RTC_UIE_OFF:
-		mutex_unlock(&rtc->ops_lock);
-		clear_uie(rtc);
-		return 0;
-
-	case RTC_UIE_ON:
-		mutex_unlock(&rtc->ops_lock);
-		err = set_uie(rtc);
-		return err;
-#endif
 	default:
 		err = -ENOTTY;
 		break;
@@ -440,7 +456,10 @@ static int rtc_dev_release(struct inode *inode, struct file *file)
 	 * Leave the alarm alone; it may be set to trigger a system wakeup
 	 * later, or be used by kernel code, and is a one-shot event anyway.
 	 */
+
+	/* Keep ioctl until all drivers are converted */
 	rtc_dev_ioctl(file, RTC_UIE_OFF, 0);
+	rtc_update_irq_enable(rtc, 0);
 	rtc_irq_set_state(rtc, NULL, 0);
 
 	if (rtc->ops->release)
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 91f597ad6ac..4046b75563c 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -145,6 +145,8 @@ struct rtc_class_ops {
 	int (*irq_set_state)(struct device *, int enabled);
 	int (*irq_set_freq)(struct device *, int freq);
 	int (*read_callback)(struct device *, int data);
+	int (*alarm_irq_enable)(struct device *, unsigned int enabled);
+	int (*update_irq_enable)(struct device *, unsigned int enabled);
 };
 
 #define RTC_DEVICE_NAME_SIZE 20
@@ -181,7 +183,7 @@ struct rtc_device
 	struct timer_list uie_timer;
 	/* Those fields are protected by rtc->irq_lock */
 	unsigned int oldsecs;
-	unsigned int irq_active:1;
+	unsigned int uie_irq_active:1;
 	unsigned int stop_uie_polling:1;
 	unsigned int uie_task_active:1;
 	unsigned int uie_timer_active:1;
@@ -216,6 +218,10 @@ extern int rtc_irq_set_state(struct rtc_device *rtc,
 				struct rtc_task *task, int enabled);
 extern int rtc_irq_set_freq(struct rtc_device *rtc,
 				struct rtc_task *task, int freq);
+extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled);
+extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled);
+extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc,
+						unsigned int enabled);
 
 typedef struct rtc_task {
 	void (*func)(void *private_data);
-- 
cgit v1.2.3-70-g09d2